diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H +#include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/Register.h" #include "llvm/Pass.h" @@ -37,22 +38,19 @@ bool IsSet : 1; public: - constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, - bool IsStack = false, bool IsSet = false) - : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {} + ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, bool IsStack = false, + bool IsSet = false) + : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {} - static constexpr ArgDescriptor createRegister(Register Reg, - unsigned Mask = ~0u) { + static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) { return ArgDescriptor(Reg, Mask, false, true); } - static constexpr ArgDescriptor createStack(unsigned Offset, - unsigned Mask = ~0u) { + static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) { return ArgDescriptor(Offset, Mask, true, true); } - static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg, - unsigned Mask) { + static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) { return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet); } @@ -94,6 +92,11 @@ return OS; } +struct KernArgPreloadDescriptor : public ArgDescriptor { + KernArgPreloadDescriptor() {} + SmallVector Regs; +}; + struct AMDGPUFunctionArgInfo { enum PreloadedValue { // SGPRS: @@ -151,10 +154,13 @@ ArgDescriptor WorkItemIDY; ArgDescriptor WorkItemIDZ; + // Map the index of preloaded kernel arguments to its descriptor. 
+ SmallDenseMap PreloadKernArgs{}; + std::tuple getPreloadedValue(PreloadedValue Value) const; - static constexpr AMDGPUFunctionArgInfo fixedABILayout(); + static AMDGPUFunctionArgInfo fixedABILayout(); }; class AMDGPUArgumentUsageInfo : public ImmutablePass { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -60,6 +60,7 @@ return false; } +// TODO: Print preload kernargs? void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const { for (const auto &FI : ArgInfoMap) { OS << "Arguments for " << FI.first->getName() << '\n' @@ -148,7 +149,7 @@ llvm_unreachable("unexpected preloaded value type"); } -constexpr AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() { +AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() { AMDGPUFunctionArgInfo AI; AI.PrivateSegmentBuffer = ArgDescriptor::createRegister(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -219,6 +219,10 @@ if (STM.isAmdHsaOS()) HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo); + + if (MF->getInfo()->getNumKernargPreloadedSGPRs() > 0) { + getTargetStreamer()->EmitKernargPreloadHeader(*getGlobalSTI()); + } } void AMDGPUAsmPrinter::emitFunctionBodyEnd() { @@ -436,6 +440,7 @@ const SIProgramInfo &PI) const { const GCNSubtarget &STM = MF.getSubtarget(); const Function &F = MF.getFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo(); amdhsa::kernel_descriptor_t KernelDescriptor; memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor)); @@ -459,6 +464,10 @@ KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A; + if (STM.hasGFX90AInsts()) + KernelDescriptor.kernarg_preload = + 
static_cast(Info->getNumKernargPreloadedSGPRs()); + return KernelDescriptor; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -169,10 +169,10 @@ ConstantInt::get(Builder.getInt32Ty(), PreloadSGPRs)); PreloadInfo.KernelArgMetadata.push_back( MDNode::get(Ctx, {MDIndex, MDAllocSizeSGPRs})); + continue; } - } else { - InPreloadSequence = false; } + InPreloadSequence = false; // If this is byval, the loads are already explicit in the function. We just // need to rewrite the pointer values. diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -90,6 +90,11 @@ /// \returns True on success, false on failure. virtual bool EmitCodeEnd(const MCSubtargetInfo &STI) { return true; } + /// \returns True on success, false on failure. + virtual bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) { + return true; + } + virtual void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, @@ -154,6 +159,9 @@ /// \returns True on success, false on failure. bool EmitCodeEnd(const MCSubtargetInfo &STI) override; + /// \returns True on success, false on failure. + bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) override; + void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, @@ -215,6 +223,9 @@ /// \returns True on success, false on failure. bool EmitCodeEnd(const MCSubtargetInfo &STI) override; + /// \returns True on success, false on failure. 
+ bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) override; + void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -833,6 +833,24 @@ return true; } +bool AMDGPUTargetAsmStreamer::EmitKernargPreloadHeader( + const MCSubtargetInfo &STI) { + for (int i = 0; i < 64; ++i) { + OS << "\ts_nop 0\n"; + } + return true; +} + +bool AMDGPUTargetELFStreamer::EmitKernargPreloadHeader( + const MCSubtargetInfo &STI) { + const uint32_t Encoded_s_nop = 0xbf800000; + MCStreamer &OS = getStreamer(); + for (int i = 0; i < 64; ++i) { + OS.emitInt32(Encoded_s_nop); + } + return true; +} + bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { const uint32_t Encoded_s_code_end = 0xbf9f0000; const uint32_t Encoded_s_nop = 0xbf800000; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -546,6 +546,17 @@ const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const; + void allocatePreloadKernArgSGPRs(CCState &CCInfo, + SmallVectorImpl &ArgLocs, + const SmallVectorImpl &Ins, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const; + + void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const; + void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2228,14 +2228,93 @@ CCInfo.AllocateReg(FlatScratchInitReg); } + // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read + // these from the dispatch pointer. +} + +void SITargetLowering::allocatePreloadKernArgSGPRs( + CCState &CCInfo, SmallVectorImpl &ArgLocs, + const SmallVectorImpl &Ins, MachineFunction &MF, + const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { + // Allocate pre-loaded kernel arguments. + const Function &F = MF.getFunction(); + MDNode *MD = F.getMetadata("preload_kernel_args"); + if (!MD) + return; + + if (!dyn_cast(MD->operands().begin()->get())) + return; + +#ifndef NDEBUG + unsigned LastIdx = 0; +#endif + + unsigned InIdx = 0; + unsigned LastExplicitArgOffset = + MF.getSubtarget().getExplicitKernelArgOffset(); + for (auto &N : MD->operands()) { + auto *ArgNode = cast(N.get()); + assert(ArgNode && ArgNode->getNumOperands() == 2); + unsigned ArgIdx = + mdconst::extract(ArgNode->getOperand(0))->getZExtValue(); + + while (InIdx < Ins.size() && + (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx)) + InIdx++; + + for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() && + Ins[InIdx].getOrigArgIndex() == ArgIdx; + InIdx++) { +#ifndef NDEBUG + // Verify sequential. + if (LastIdx != 0) + assert(LastIdx + 1 == InIdx); + LastIdx = InIdx; +#endif + + assert(ArgLocs[ArgIdx].isMemLoc()); + auto &ArgLoc = ArgLocs[InIdx]; + const Align KernelArgBaseAlign = Align(16); + unsigned ArgOffset = ArgLoc.getLocMemOffset(); + Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset); + unsigned NumAllocSGPRs = + alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32; + + // Arg is preloaded into the previous SGPR. 
+ if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) { + Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back( + Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]); + continue; + } + + unsigned Padding = ArgOffset - LastExplicitArgOffset; + const TargetRegisterClass *RC = + TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32); + SmallVectorImpl *PreloadRegs = + Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, Padding); + + if (PreloadRegs->size() > 1) + RC = &AMDGPU::SGPR_32RegClass; + for (auto &Reg : *PreloadRegs) { + assert(Reg); + MF.addLiveIn(Reg, RC); + CCInfo.AllocateReg(Reg); + } + + LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset; + } + } +} + +void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const { + // Always allocate this last since it is a synthetic preload. if (Info.hasLDSKernelId()) { Register Reg = Info.addLDSKernelId(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } - - // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read - // these from the dispatch pointer. } // Allocate special input registers that are initialized per-wave. @@ -2541,17 +2620,20 @@ Splits.append(Ins.begin(), Ins.end()); } + if (IsKernel) + analyzeFormalArgumentsCompute(CCInfo, Ins); + if (IsEntryFunc) { allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info); + allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info); + allocateLDSKernelId(CCInfo, MF, *TRI, *Info); } else if (!IsGraphics) { // For the fixed ABI, pass workitem IDs in the last argument register. 
allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); } - if (IsKernel) { - analyzeFormalArgumentsCompute(CCInfo, Ins); - } else { + if (!IsKernel) { CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg); CCInfo.AnalyzeFormalArguments(Splits, AssignFn); } @@ -2597,9 +2679,87 @@ continue; } - SDValue Arg = lowerKernargMemParameter( - DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]); - Chains.push_back(Arg.getValue(1)); + SDValue NewArg; + if (Arg.isOrigArg() && + Info->getArgInfo().PreloadKernArgs.count(Arg.getOrigArgIndex())) { + if (MemVT.getStoreSize() < 4 && Alignment < 4) { + // In this case the argument is packed into the previous preload SGPR. + int64_t AlignDownOffset = alignDown(Offset, 4); + int64_t OffsetDiff = Offset - AlignDownOffset; + EVT IntVT = MemVT.changeTypeToInteger(); + + const SIMachineFunctionInfo *Info = + MF.getInfo(); + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + Register Reg = Info->getArgInfo() + .PreloadKernArgs.find(ArgIdx - 1) + ->getSecond() + .Regs[0]; + + assert(Reg); + Register VReg = MRI.getLiveInVirtReg(Reg); + SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32); + + SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32); + SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt); + + SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract); + ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal); + NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal, + Ins[i].Flags.isSExt(), &Ins[i]); + + NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL); + } else { + const SIMachineFunctionInfo *Info = + MF.getInfo(); + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + const SmallVectorImpl &PreloadRegs = + Info->getArgInfo() + .PreloadKernArgs.find(ArgIdx - 1) + ->getSecond() + .Regs; + + SDValue Copy; + if (PreloadRegs.size() == 1) { + Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]); + const 
TargetRegisterClass *RC = MRI.getRegClass(VReg); + NewArg = DAG.getCopyFromReg( + Chain, DL, VReg, + EVT::getIntegerVT(*DAG.getContext(), + TRI->getRegSizeInBits(*RC))); + + } else { + // If the kernarg alignment does not match the alignment of the SGPR + // tuple RC that can accommodate this argument, it will be built up + // via copies from the individual SGPRs that the argument was + // preloaded to. + SmallVector Elts; + for (auto Reg : PreloadRegs) { + Register VReg = MRI.getLiveInVirtReg(Reg); + Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32); + Elts.push_back(Copy); + } + NewArg = + DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32, + PreloadRegs.size()), + DL, Elts); + } + + SDValue CMemVT; + if (VT.isScalarInteger() && VT.bitsLT(NewArg.getSimpleValueType())) + CMemVT = DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewArg); + else + CMemVT = DAG.getBitcast(MemVT, NewArg); + NewArg = convertArgType(DAG, VT, MemVT, DL, CMemVT, + Ins[i].Flags.isSExt(), &Ins[i]); + NewArg = DAG.getMergeValues({NewArg, Chain}, DL); + } + } else { + NewArg = + lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset, + Alignment, Ins[i].Flags.isSExt(), &Ins[i]); + } + Chains.push_back(NewArg.getValue(1)); auto *ParamTy = dyn_cast(FType->getParamType(Ins[i].getOrigArgIndex())); @@ -2609,11 +2769,11 @@ // On SI local pointers are just offsets into LDS, so they are always // less than 16-bits. On CI and newer they could potentially be // real pointers, so we can't guarantee their size. 
- Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, - DAG.getValueType(MVT::i16)); + NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg, + DAG.getValueType(MVT::i16)); } - InVals.push_back(Arg); + InVals.push_back(NewArg); continue; } else if (!IsEntryFunc && VA.isMemLoc()) { SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -21,6 +21,7 @@ #include "SIInstrInfo.h" #include "SIModeRegisterDefaults.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/Support/raw_ostream.h" @@ -428,6 +429,7 @@ private: unsigned NumUserSGPRs = 0; unsigned NumSystemSGPRs = 0; + unsigned NumKernargPreloadedSGPRs = 0; bool HasSpilledSGPRs = false; bool HasSpilledVGPRs = false; @@ -727,6 +729,9 @@ Register addFlatScratchInit(const SIRegisterInfo &TRI); Register addImplicitBufferPtr(const SIRegisterInfo &TRI); Register addLDSKernelId(); + SmallVectorImpl * + addPreloadedKernArg(const SIRegisterInfo &TRI, const TargetRegisterClass *RC, + unsigned AllocSizeDWord, int KernArgIdx, int Padding); /// Increment user SGPRs used for padding the argument list only. 
Register addReservedUserSGPR() { @@ -872,6 +877,10 @@ return NumUserSGPRs + NumSystemSGPRs; } + unsigned getNumKernargPreloadedSGPRs() const { + return NumKernargPreloadedSGPRs; + } + Register getPrivateSegmentWaveByteOffsetSystemSGPR() const { return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -243,6 +243,33 @@ return ArgInfo.LDSKernelId.getRegister(); } +SmallVectorImpl *SIMachineFunctionInfo::addPreloadedKernArg( + const SIRegisterInfo &TRI, const TargetRegisterClass *RC, + unsigned AllocSizeDWord, int KernArgIdx, int Padding) { + assert(!ArgInfo.PreloadKernArgs.count(KernArgIdx) && + "Preload kernel argument allocated twice."); + + unsigned PaddingSGPRs = alignTo(Padding, 4) / 4; + NumUserSGPRs += PaddingSGPRs; + // If the available register tuples are aligned with the kernarg to be + // preloaded use that register, otherwise we need to use a set of SGPRs and + // merge them. + if (Register PreloadReg = + TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC)) { + ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(PreloadReg); + NumUserSGPRs += AllocSizeDWord; + } else { + for (unsigned I = 0; I < AllocSizeDWord; ++I) { + ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(getNextUserSGPR()); + NumUserSGPRs++; + } + } + + // Track the actual number of SGPRs that HW will preload to. + NumKernargPreloadedSGPRs += AllocSizeDWord + PaddingSGPRs; + return &ArgInfo.PreloadKernArgs[KernArgIdx].Regs; +} + void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size, Align Alignment) { // Skip if it is an entry function or the register is already added. @@ -570,6 +597,7 @@ return true; }; + // TODO: Need to serialize kernarg preloads. 
bool Any = false; Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer); Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr); diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll @@ -0,0 +1,19 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx90a --disassemble - | FileCheck -check-prefixes=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN %s + +; GCN: preload_kernarg_header +; GCN-COUNT-64: s_nop 0 +define amdgpu_kernel void @preload_kernarg_header(ptr %arg) { + store ptr %arg, ptr %arg + ret void +} + +; GCN: non_kernel_function +; GCN-NOT: s_nop 0 +; GCN: flat_store +define void @non_kernel_function(ptr %arg) { + store ptr %arg, ptr %arg + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs-metadata.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs-metadata.ll --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs-metadata.ll @@ -21,34 +21,24 @@ ; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_2 ; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] !preload_kernel_args !0 { ; PRELOAD-1-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; 
PRELOAD-1-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT]], i64 0 -; PRELOAD-1-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !2 ; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT]], i64 8 ; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !2 -; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; PRELOAD-1-NEXT: ret void ; ; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_2 ; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] !preload_kernel_args !0 { ; PRELOAD-3-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-3-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT]], i64 0 -; PRELOAD-3-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 -; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT]], i64 8 -; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !3 -; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; 
PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-3-NEXT: ret void ; ; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_2 ; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] !preload_kernel_args !0 { ; PRELOAD-8-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-8-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT]], i64 0 -; PRELOAD-8-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 -; PRELOAD-8-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT]], i64 8 -; PRELOAD-8-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !3 -; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-8-NEXT: ret void ; %load = load i32, ptr addrspace(1) %in @@ -77,52 +67,36 @@ ; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4 ; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !0 { ; PRELOAD-1-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-1-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 0 -; 
PRELOAD-1-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !2 ; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 8 ; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !2 ; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 16 ; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !2 ; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 24 ; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !2 -; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 ; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 ; PRELOAD-1-NEXT: ret void ; ; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4 -; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !4 { +; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !3 { ; PRELOAD-3-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) 
@llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-3-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 0 -; PRELOAD-3-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 -; PRELOAD-3-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 8 -; PRELOAD-3-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 -; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 16 -; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 ; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 24 -; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 -; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 -; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !5 +; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 ; PRELOAD-3-NEXT: ret void ; ; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4 -; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr 
addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !4 { +; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !3 { ; PRELOAD-8-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-8-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 0 -; PRELOAD-8-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 -; PRELOAD-8-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 8 -; PRELOAD-8-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 -; PRELOAD-8-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 16 -; PRELOAD-8-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 -; PRELOAD-8-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 24 -; PRELOAD-8-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 -; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 -; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-8-NEXT: [[LOAD:%.*]] = load 
i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 ; PRELOAD-8-NEXT: ret void ; %load = load i32, ptr addrspace(1) %in @@ -165,8 +139,6 @@ ; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_8 ; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] !preload_kernel_args !0 { ; PRELOAD-1-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-1-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 0 -; PRELOAD-1-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !2 ; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 8 ; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !2 ; PRELOAD-1-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 16 @@ -181,7 +153,7 @@ ; PRELOAD-1-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load !2 ; PRELOAD-1-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 56 ; PRELOAD-1-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) 
[[OUT3_KERNARG_OFFSET]], align 8, !invariant.load !2 -; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 ; PRELOAD-1-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4 ; PRELOAD-1-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4 @@ -192,27 +164,21 @@ ; PRELOAD-1-NEXT: ret void ; ; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_8 -; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] !preload_kernel_args !4 { +; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] !preload_kernel_args !3 { ; PRELOAD-3-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-3-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 0 -; PRELOAD-3-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 -; PRELOAD-3-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 8 -; PRELOAD-3-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 -; PRELOAD-3-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = 
getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 16 -; PRELOAD-3-NEXT: [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load !3 ; PRELOAD-3-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 24 -; PRELOAD-3-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load !5 ; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 32 -; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !5 ; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 40 -; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !5 ; PRELOAD-3-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 48 -; PRELOAD-3-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load !5 ; PRELOAD-3-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 56 -; PRELOAD-3-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load !3 -; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 -; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-3-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4 +; PRELOAD-3-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load !5 +; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-3-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4 ; PRELOAD-3-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4 ; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 @@ -221,29 +187,19 @@ ; PRELOAD-3-NEXT: ret void ; ; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_8 -; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) inreg [[OUT3:%.*]]) #[[ATTR0]] !preload_kernel_args !7 { +; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) inreg [[OUT3:%.*]]) #[[ATTR0]] !preload_kernel_args !6 { ; PRELOAD-8-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) 
@llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-8-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 0 -; PRELOAD-8-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 -; PRELOAD-8-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 8 -; PRELOAD-8-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 -; PRELOAD-8-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 16 -; PRELOAD-8-NEXT: [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load !3 -; PRELOAD-8-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 24 -; PRELOAD-8-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load !3 -; PRELOAD-8-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 32 -; PRELOAD-8-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 ; PRELOAD-8-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 40 -; PRELOAD-8-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !8 ; PRELOAD-8-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], 
i64 48 -; PRELOAD-8-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load !8 ; PRELOAD-8-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 56 -; PRELOAD-8-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load !3 -; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 -; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-8-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4 -; PRELOAD-8-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-8-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load !8 +; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-8-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4 +; PRELOAD-8-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 ; PRELOAD-8-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4 ; PRELOAD-8-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4 @@ -283,15 +239,13 @@ ; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_inreg_offset ; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] 
!preload_kernel_args !0 { ; PRELOAD-1-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-1-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 0 -; PRELOAD-1-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !2 ; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8 ; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !2 ; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16 ; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !2 ; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24 ; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !2 -; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 ; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 @@ -300,35 +254,19 @@ ; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_inreg_offset ; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg 
[[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !6 { ; PRELOAD-3-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-3-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 0 -; PRELOAD-3-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 -; PRELOAD-3-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8 -; PRELOAD-3-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 -; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16 -; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 -; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24 -; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 -; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 -; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 
+; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 ; PRELOAD-3-NEXT: ret void ; ; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_inreg_offset -; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !4 { +; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !3 { ; PRELOAD-8-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-8-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 0 -; PRELOAD-8-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 -; PRELOAD-8-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8 -; PRELOAD-8-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 -; PRELOAD-8-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16 -; PRELOAD-8-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 -; PRELOAD-8-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24 -; PRELOAD-8-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 -; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr 
addrspace(1) [[IN_LOAD]], align 4 -; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 ; PRELOAD-8-NEXT: ret void ; %load = load i32, ptr addrspace(1) %in @@ -361,15 +299,13 @@ ; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_inreg_offset_two_sequence ; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !0 { ; PRELOAD-1-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-1-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 0 -; PRELOAD-1-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !2 ; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8 ; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !2 ; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16 ; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr 
addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !2 ; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24 ; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !2 -; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 ; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 ; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 @@ -378,35 +314,19 @@ ; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_inreg_offset_two_sequence ; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !6 { ; PRELOAD-3-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-3-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 0 -; PRELOAD-3-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 -; PRELOAD-3-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8 -; PRELOAD-3-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 -; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16 -; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 -; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24 -; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 -; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 -; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 ; PRELOAD-3-NEXT: ret void ; ; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_inreg_offset_two_sequence -; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !4 { +; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !3 { ; PRELOAD-8-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-8-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 0 -; PRELOAD-8-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 -; PRELOAD-8-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8 -; PRELOAD-8-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 -; PRELOAD-8-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16 -; PRELOAD-8-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 -; PRELOAD-8-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24 -; PRELOAD-8-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 -; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 -; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 ; PRELOAD-8-NEXT: ret void ; %load = load i32, ptr addrspace(1) %in @@ -442,9 +362,6 @@ ; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_misaligned ; PRELOAD-1-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) 
[[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !3 { ; PRELOAD-1-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-1-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 0 -; PRELOAD-1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !2 -; PRELOAD-1-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 ; PRELOAD-1-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 8 ; PRELOAD-1-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load !2 ; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 16 @@ -455,7 +372,7 @@ ; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load !2 ; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 ; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-1-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32 +; PRELOAD-1-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 ; PRELOAD-1-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] ; PRELOAD-1-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 @@ -464,20 +381,13 @@ ; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_misaligned ; PRELOAD-3-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr 
addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !8 { ; PRELOAD-3-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-3-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 0 -; PRELOAD-3-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !3 -; PRELOAD-3-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 -; PRELOAD-3-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 8 -; PRELOAD-3-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load !3 -; PRELOAD-3-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 16 -; PRELOAD-3-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 16, !invariant.load !3 ; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24 -; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !5 ; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32 -; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load !3 -; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) 
[[IN_LOAD]], align 4 -; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-3-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32 +; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load !5 +; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-3-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 ; PRELOAD-3-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] ; PRELOAD-3-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 @@ -486,22 +396,13 @@ ; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_misaligned ; PRELOAD-8-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !9 { ; PRELOAD-8-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-8-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 0 -; PRELOAD-8-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !3 -; PRELOAD-8-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 -; PRELOAD-8-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 8 -; PRELOAD-8-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load !3 -; PRELOAD-8-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 16 -; PRELOAD-8-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 16, !invariant.load !3 -; PRELOAD-8-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24 -; PRELOAD-8-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !3 ; PRELOAD-8-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32 -; PRELOAD-8-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load !3 -; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 -; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-8-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32 +; PRELOAD-8-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load !8 +; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-8-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 ; PRELOAD-8-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] -; PRELOAD-8-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 ; PRELOAD-8-NEXT: ret void ; @@ -538,17 +439,14 @@ ; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_i16_i16 ; PRELOAD-1-SAME: (i16 inreg [[ARG0:%.*]], i16 [[ARG1:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] !preload_kernel_args !3 { ; PRELOAD-1-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull 
align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-1-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0 -; PRELOAD-1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !2 -; PRELOAD-1-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 ; PRELOAD-1-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0 -; PRELOAD-1-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !2 -; PRELOAD-1-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16 -; PRELOAD-1-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; PRELOAD-1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16 +; PRELOAD-1-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16 ; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 8 ; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !2 -; PRELOAD-1-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32 -; PRELOAD-1-NEXT: [[EXT1:%.*]] = zext i16 [[TMP5]] to i32 +; PRELOAD-1-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 +; PRELOAD-1-NEXT: [[EXT1:%.*]] = zext i16 [[TMP3]] to i32 ; PRELOAD-1-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] ; PRELOAD-1-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; PRELOAD-1-NEXT: ret void @@ -556,37 +454,19 @@ ; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_i16_i16 ; PRELOAD-3-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] 
!preload_kernel_args !10 { ; PRELOAD-3-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-3-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0 -; PRELOAD-3-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !3 -; PRELOAD-3-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 -; PRELOAD-3-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0 -; PRELOAD-3-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !3 -; PRELOAD-3-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16 -; PRELOAD-3-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 -; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 8 -; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !3 -; PRELOAD-3-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32 -; PRELOAD-3-NEXT: [[EXT1:%.*]] = zext i16 [[TMP5]] to i32 +; PRELOAD-3-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 +; PRELOAD-3-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32 ; PRELOAD-3-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] -; PRELOAD-3-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-3-NEXT: ret void ; ; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_i16_i16 ; PRELOAD-8-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] !preload_kernel_args !11 { ; PRELOAD-8-NEXT: 
[[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-8-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0 -; PRELOAD-8-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !3 -; PRELOAD-8-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 -; PRELOAD-8-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0 -; PRELOAD-8-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !3 -; PRELOAD-8-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16 -; PRELOAD-8-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 -; PRELOAD-8-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 8 -; PRELOAD-8-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !3 -; PRELOAD-8-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32 -; PRELOAD-8-NEXT: [[EXT1:%.*]] = zext i16 [[TMP5]] to i32 +; PRELOAD-8-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 +; PRELOAD-8-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32 ; PRELOAD-8-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] -; PRELOAD-8-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-8-NEXT: ret void ; %ext = zext i16 %arg0 to i32 @@ -597,7 +477,6 @@ } attributes #0 = { nounwind } -;. 
; NO-PRELOAD: attributes #[[ATTR0]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } ; NO-PRELOAD: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. @@ -621,27 +500,27 @@ ; PRELOAD-3: [[META0:![0-9]+]] = !{!1, !2} ; PRELOAD-3: [[META1:![0-9]+]] = !{i32 0, i32 2} ; PRELOAD-3: [[META2:![0-9]+]] = !{i32 1, i32 2} -; PRELOAD-3: [[META3:![0-9]+]] = !{} -; PRELOAD-3: [[META4:![0-9]+]] = !{!1, !2, !5} -; PRELOAD-3: [[META5:![0-9]+]] = !{i32 2, i32 2} -; PRELOAD-3: [[META6:![0-9]+]] = !{!1, !2, !5, !7} +; PRELOAD-3: [[META3:![0-9]+]] = !{!1, !2, !4} +; PRELOAD-3: [[META4:![0-9]+]] = !{i32 2, i32 2} +; PRELOAD-3: [[META5:![0-9]+]] = !{} +; PRELOAD-3: [[META6:![0-9]+]] = !{!1, !2, !4, !7} ; PRELOAD-3: [[META7:![0-9]+]] = !{i32 3, i32 2} -; PRELOAD-3: [[META8:![0-9]+]] = !{!9, !2, !5} +; PRELOAD-3: [[META8:![0-9]+]] = !{!9, !2, !4} ; PRELOAD-3: [[META9:![0-9]+]] = !{i32 0, i32 1} -; PRELOAD-3: [[META10:![0-9]+]] = !{!9, !11, !5} +; PRELOAD-3: [[META10:![0-9]+]] = !{!9, !11, !4} ; PRELOAD-3: [[META11:![0-9]+]] = !{i32 1, i32 1} ;. 
; PRELOAD-8: [[META0:![0-9]+]] = !{!1, !2} ; PRELOAD-8: [[META1:![0-9]+]] = !{i32 0, i32 2} ; PRELOAD-8: [[META2:![0-9]+]] = !{i32 1, i32 2} -; PRELOAD-8: [[META3:![0-9]+]] = !{} -; PRELOAD-8: [[META4:![0-9]+]] = !{!1, !2, !5, !6} -; PRELOAD-8: [[META5:![0-9]+]] = !{i32 2, i32 2} -; PRELOAD-8: [[META6:![0-9]+]] = !{i32 3, i32 2} -; PRELOAD-8: [[META7:![0-9]+]] = !{!1, !2, !5, !6, !8} -; PRELOAD-8: [[META8:![0-9]+]] = !{i32 4, i32 2} -; PRELOAD-8: [[META9:![0-9]+]] = !{!10, !2, !5, !6} +; PRELOAD-8: [[META3:![0-9]+]] = !{!1, !2, !4, !5} +; PRELOAD-8: [[META4:![0-9]+]] = !{i32 2, i32 2} +; PRELOAD-8: [[META5:![0-9]+]] = !{i32 3, i32 2} +; PRELOAD-8: [[META6:![0-9]+]] = !{!1, !2, !4, !5, !7} +; PRELOAD-8: [[META7:![0-9]+]] = !{i32 4, i32 2} +; PRELOAD-8: [[META8:![0-9]+]] = !{} +; PRELOAD-8: [[META9:![0-9]+]] = !{!10, !2, !4, !5} ; PRELOAD-8: [[META10:![0-9]+]] = !{i32 0, i32 1} -; PRELOAD-8: [[META11:![0-9]+]] = !{!10, !12, !5} +; PRELOAD-8: [[META11:![0-9]+]] = !{!10, !12, !4} ; PRELOAD-8: [[META12:![0-9]+]] = !{i32 1, i32 1} ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll @@ -0,0 +1,5418 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=NO-PRELOAD %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRELOAD-1 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRELOAD-2 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRELOAD-4 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRELOAD-8 %s + +define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) { +; NO-PRELOAD-LABEL: ptr1_i8: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: s_and_b32 s2, s2, 0xff +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: ptr1_i8: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; 
PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: s_and_b32 s0, s0, 0xff +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-1-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: ptr1_i8: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; 
PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xff +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: ptr1_i8: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; 
PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: s_and_b32 s0, s8, 0xff +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-4-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: ptr1_i8: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; 
PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xff +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-8-NEXT: s_endpgm + %ext = zext i8 %arg0 to i32 + store i32 %ext, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %arg0) { +; NO-PRELOAD-LABEL: ptr1_i8_zext_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) 
+; NO-PRELOAD-NEXT: s_and_b32 s2, s2, 0xff +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: ptr1_i8_zext_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dword s0, 
s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: s_and_b32 s0, s0, 0xff +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-1-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: ptr1_i8_zext_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; 
PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: s_mov_b32 s0, 0xffff +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-2-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: ptr1_i8_zext_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; 
PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: s_mov_b32 s0, 0xffff +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s8 +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-4-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; PRELOAD-4-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: ptr1_i8_zext_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; 
PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: s_mov_b32 s0, 0xffff +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-8-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-8-NEXT: s_endpgm + %ext = zext i8 %arg0 to i32 + store i32 %ext, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0) { +; NO-PRELOAD-LABEL: ptr1_i16_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: s_and_b32 s2, s2, 0xffff +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: ptr1_i16_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: 
s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: s_and_b32 s0, s0, 0xffff +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-1-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: ptr1_i16_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; 
PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xffff +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: ptr1_i16_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; 
PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: s_and_b32 s0, s8, 0xffff +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-4-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: ptr1_i16_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; 
PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xffff +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-8-NEXT: s_endpgm + %ext = zext i16 %arg0 to i32 + store i32 %ext, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0) { +; NO-PRELOAD-LABEL: ptr1_i32_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: ptr1_i32_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; 
PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-1-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: ptr1_i32_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; 
PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: ptr1_i32_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: 
s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s8 +; PRELOAD-4-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: ptr1_i32_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; 
PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-8-NEXT: s_endpgm + store i32 %arg0, ptr addrspace(1) %out + ret void +} + +; Check alignment on the second preloaded arg. 
+ +define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) { +; NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x10 +; NO-PRELOAD-NEXT: s_load_dword s3, s[4:5], 0x0 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: s_add_i32 s2, s3, s2 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; 
PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x10 +; PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: s_add_i32 s2, s6, s2 +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 +; PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: 
s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x10 +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-2-NEXT: s_add_i32 s0, s6, s0 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-2-NEXT: global_store_dword v0, v1, s[8:9] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; 
PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: s_add_i32 s0, s6, s10 +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-4-NEXT: global_store_dword v0, v1, s[8:9] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 
+; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: s_add_i32 s0, s6, s10 +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-8-NEXT: global_store_dword v0, v1, s[8:9] +; PRELOAD-8-NEXT: s_endpgm + %add = add i32 %arg0, %arg1 + store i32 %add, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) { +; NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: s_lshr_b32 s3, s2, 16 +; NO-PRELOAD-NEXT: s_and_b32 s2, s2, 0xffff +; NO-PRELOAD-NEXT: s_add_i32 s2, s2, s3 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; 
PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: s_lshr_b32 s1, s0, 16 +; PRELOAD-1-NEXT: s_and_b32 s0, s0, 0xffff +; PRELOAD-1-NEXT: s_add_i32 s0, s0, s1 +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-1-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: 
s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x8 +; PRELOAD-2-NEXT: s_and_b32 s1, s8, 0xffff +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-2-NEXT: s_lshr_b32 s0, s0, 16 +; PRELOAD-2-NEXT: s_add_i32 s0, s1, s0 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; 
PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 16 +; PRELOAD-4-NEXT: s_and_b32 s1, s8, 0xffff +; PRELOAD-4-NEXT: s_add_i32 s0, s1, s0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-4-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: 
ptr1_i16_i16_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 +; PRELOAD-8-NEXT: s_and_b32 s1, s8, 0xffff +; PRELOAD-8-NEXT: s_add_i32 s0, s1, s0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-8-NEXT: 
global_store_dword v0, v1, s[6:7] +; PRELOAD-8-NEXT: s_endpgm + %ext = zext i16 %arg0 to i32 + %ext1 = zext i16 %arg1 to i32 + %add = add i32 %ext, %ext1 + store i32 %add, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) { +; NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: ptr1_v2i8_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; 
PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-1-NEXT: global_store_short v0, v1, s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: ptr1_v2i8_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; 
PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 +; PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 +; PRELOAD-2-NEXT: global_store_short v1, v0, s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: ptr1_v2i8_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: 
s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 8 +; PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-4-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, 0 +; PRELOAD-4-NEXT: global_store_short v1, v0, s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: ptr1_v2i8_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; 
PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 +; PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 +; PRELOAD-8-NEXT: global_store_short v1, v0, s[6:7] +; PRELOAD-8-NEXT: s_endpgm + store <2 x i8> %in, ptr addrspace(1) %out + ret void +} + +; Don't try to preload byref args. 
+ +define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) { +; NO-PRELOAD-LABEL: byref_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s1 +; NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] +; NO-PRELOAD-NEXT: s_waitcnt vmcnt(0) +; NO-PRELOAD-NEXT: global_store_dword v0, v2, s[2:3] +; NO-PRELOAD-NEXT: s_waitcnt vmcnt(0) +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: byref_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 
0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s1 +; PRELOAD-1-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-1-NEXT: s_waitcnt vmcnt(0) +; PRELOAD-1-NEXT: global_store_dword v0, v2, s[6:7] +; PRELOAD-1-NEXT: s_waitcnt vmcnt(0) +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: byref_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: 
s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s1 +; PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-2-NEXT: s_waitcnt vmcnt(0) +; PRELOAD-2-NEXT: global_store_dword v0, v2, s[6:7] +; PRELOAD-2-NEXT: s_waitcnt vmcnt(0) +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: byref_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; 
PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v2, s1 +; PRELOAD-4-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-4-NEXT: s_waitcnt vmcnt(0) +; PRELOAD-4-NEXT: global_store_dword v0, v2, s[6:7] +; PRELOAD-4-NEXT: s_waitcnt vmcnt(0) +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: byref_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 
+; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s1 +; PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-8-NEXT: s_waitcnt vmcnt(0) +; PRELOAD-8-NEXT: global_store_dword v0, v2, s[6:7] +; PRELOAD-8-NEXT: s_waitcnt vmcnt(0) +; PRELOAD-8-NEXT: s_endpgm + %in = load i32, ptr addrspace(4) %in.byref + store volatile i32 %in, ptr addrspace(1) %out, align 4 + store volatile i32 %after.offset, ptr addrspace(1) %out, align 4 + ret void +} + +; TODO: Should do partial preload in cases like these where only part of the arg +; can be preloaded. 
+ +define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind { +; NO-PRELOAD-LABEL: v8i32_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s12 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s13 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s14 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s15 +; NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; NO-PRELOAD-NEXT: s_nop 0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s8 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s9 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s10 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s11 +; NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: v8i32_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: 
s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, s12 +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s13 +; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s14 +; PRELOAD-1-NEXT: v_mov_b32_e32 v3, s15 +; PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9 +; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 +; PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 +; PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: v8i32_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: 
s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 +; PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, s12 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s13 +; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s14 +; PRELOAD-2-NEXT: v_mov_b32_e32 v3, s15 +; PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 +; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 +; PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 +; PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: v8i32_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: 
s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 +; PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, s12 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s13 +; PRELOAD-4-NEXT: v_mov_b32_e32 v2, s14 +; PRELOAD-4-NEXT: v_mov_b32_e32 v3, s15 +; PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 +; 
PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 +; PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 +; PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: v8i32_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], 
s[4:5], 0x20 +; PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 +; PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, s12 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s13 +; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s14 +; PRELOAD-8-NEXT: v_mov_b32_e32 v3, s15 +; PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 +; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 +; PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11 +; PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PRELOAD-8-NEXT: s_endpgm + store <8 x i32> %in, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind { +; NO-PRELOAD-LABEL: v3i16_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 +; NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: v3i16_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; 
PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1 +; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s0 +; PRELOAD-1-NEXT: global_store_short v0, v1, s[6:7] offset:4 +; PRELOAD-1-NEXT: global_store_dword v0, v2, s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: v3i16_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; 
PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 +; PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] offset:4 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: v3i16_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 
+; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 +; PRELOAD-4-NEXT: global_store_short v0, v1, s[6:7] offset:4 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s8 +; PRELOAD-4-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: v3i16_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 
0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 +; PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] offset:4 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-8-NEXT: s_endpgm + store <3 x i16> %in, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind { +; NO-PRELOAD-LABEL: v3i32_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_mov_b32_e32 
v0, s0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: v3i32_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x10 +; PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, s0 +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1 +; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 +; PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: v3i32_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: 
s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 +; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 +; PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: v3i32_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; 
PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, s10 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s11 +; PRELOAD-4-NEXT: v_mov_b32_e32 v2, s12 +; PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 +; PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: v3i32_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; 
PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; PRELOAD-8-NEXT: s_endpgm + store <3 x i32> %in, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind { +; NO-PRELOAD-LABEL: v3f32_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: v3f32_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; 
PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, s0 +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1 +; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 +; PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: v3f32_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; 
PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 +; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 +; PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: v3f32_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; 
PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, s10 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s11 +; PRELOAD-4-NEXT: v_mov_b32_e32 v2, s12 +; PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: v3f32_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; 
PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; PRELOAD-8-NEXT: s_endpgm + store <3 x float> %in, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind { +; NO-PRELOAD-LABEL: v5i8_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; NO-PRELOAD-NEXT: global_store_byte v0, v1, s[0:1] offset:4 +; NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] +; NO-PRELOAD-NEXT: 
s_endpgm +; +; PRELOAD-1-LABEL: v5i8_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1 +; PRELOAD-1-NEXT: 
v_mov_b32_e32 v2, s0 +; PRELOAD-1-NEXT: global_store_byte v0, v1, s[6:7] offset:4 +; PRELOAD-1-NEXT: global_store_dword v0, v2, s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: v5i8_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 +; 
PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24 +; PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16 +; PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 +; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s9 +; PRELOAD-2-NEXT: global_store_byte v1, v2, s[6:7] offset:4 +; PRELOAD-2-NEXT: global_store_dword v1, v0, s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: v5i8_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: 
s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 8 +; PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 24 +; PRELOAD-4-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 16 +; PRELOAD-4-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-4-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-4-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, 0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v2, s9 +; PRELOAD-4-NEXT: global_store_byte v1, v2, s[6:7] offset:4 +; PRELOAD-4-NEXT: global_store_dword v1, v0, s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: v5i8_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: 
s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 +; PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 +; PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 +; PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s9 +; PRELOAD-8-NEXT: global_store_byte v1, v2, s[6:7] offset:4 +; PRELOAD-8-NEXT: global_store_dword v1, v0, s[6:7] +; PRELOAD-8-NEXT: s_endpgm + store <5 x i8> %in, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, 
<5 x double> %in) nounwind { +; NO-PRELOAD-LABEL: v5f64_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 +; NO-PRELOAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s12 +; NO-PRELOAD-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s13 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s14 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s15 +; NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 +; NO-PRELOAD-NEXT: s_nop 0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s8 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s9 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s10 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s11 +; NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: v5f64_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; 
PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 +; PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, s12 +; PRELOAD-1-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s13 +; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s14 +; PRELOAD-1-NEXT: v_mov_b32_e32 v3, s15 +; PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9 +; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 +; PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 +; PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: v5f64_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; 
PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 +; PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 +; PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-2-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, s12 +; PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s13 +; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s14 +; PRELOAD-2-NEXT: v_mov_b32_e32 v3, s15 +; PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; PRELOAD-2-NEXT: s_nop 0 +; 
PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 +; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 +; PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 +; PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: v5f64_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: 
s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 +; PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 +; PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-4-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, s12 +; PRELOAD-4-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s13 +; PRELOAD-4-NEXT: v_mov_b32_e32 v2, s14 +; PRELOAD-4-NEXT: v_mov_b32_e32 v3, s15 +; PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 +; PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 +; PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 +; PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: v5f64_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: 
s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 +; PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 +; PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-8-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, s12 +; PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s13 +; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s14 +; PRELOAD-8-NEXT: v_mov_b32_e32 v3, s15 +; PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 +; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 +; PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11 +; PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PRELOAD-8-NEXT: s_endpgm + store <5 x double> %in, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) { +; NO-PRELOAD-LABEL: v8i8_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; NO-PRELOAD-NEXT: 
s_endpgm +; +; PRELOAD-1-LABEL: v8i8_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] 
op_sel:[0,1] +; PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: v8i8_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 8 +; PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-2-NEXT: 
s_lshr_b32 s0, s9, 24 +; PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 16 +; PRELOAD-2-NEXT: v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 +; PRELOAD-2-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24 +; PRELOAD-2-NEXT: v_lshlrev_b16_e64 v2, 8, s0 +; PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16 +; PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-2-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: v8i8_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; 
PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: s_lshr_b32 s0, s9, 8 +; PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-4-NEXT: s_lshr_b32 s0, s9, 24 +; PRELOAD-4-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; PRELOAD-4-NEXT: s_lshr_b32 s0, s9, 16 +; PRELOAD-4-NEXT: v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-4-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 8 +; PRELOAD-4-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 24 +; PRELOAD-4-NEXT: v_lshlrev_b16_e64 v2, 8, s0 +; PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 16 +; PRELOAD-4-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-4-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-4-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: v8i8_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 8 +; 
PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 24 +; PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 16 +; PRELOAD-8-NEXT: v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 +; PRELOAD-8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 +; PRELOAD-8-NEXT: v_lshlrev_b16_e64 v2, 8, s0 +; PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 +; PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-8-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-8-NEXT: s_endpgm + store <8 x i8> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) { +; NO-PRELOAD-LABEL: i64_kernel_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s2 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 +; NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: i64_kernel_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; 
PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: i64_kernel_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 
0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: i64_kernel_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; 
PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-4-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: i64_kernel_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; 
PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-8-NEXT: s_endpgm + store i64 %a, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double %in) { +; NO-PRELOAD-LABEL: f64_kernel_preload_arg: +; NO-PRELOAD: ; 
%bb.0: +; NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s2 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 +; NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: f64_kernel_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: 
s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: f64_kernel_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; 
PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: f64_kernel_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; 
PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-4-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: f64_kernel_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; 
PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-8-NEXT: s_endpgm + store double %in, ptr addrspace(1) %out + ret void +}