diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H +#include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/Register.h" #include "llvm/Pass.h" @@ -37,22 +38,19 @@ bool IsSet : 1; public: - constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, - bool IsStack = false, bool IsSet = false) - : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {} + ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, bool IsStack = false, + bool IsSet = false) + : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {} - static constexpr ArgDescriptor createRegister(Register Reg, - unsigned Mask = ~0u) { + static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) { return ArgDescriptor(Reg, Mask, false, true); } - static constexpr ArgDescriptor createStack(unsigned Offset, - unsigned Mask = ~0u) { + static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) { return ArgDescriptor(Offset, Mask, true, true); } - static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg, - unsigned Mask) { + static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) { return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet); } @@ -94,6 +92,11 @@ return OS; } +struct KernArgPreloadDescriptor : public ArgDescriptor { // Descriptor for one preloaded kernel argument. + KernArgPreloadDescriptor() {} + SmallVector<MCRegister> Regs; // SGPR(s) the argument is preloaded into. +}; + struct AMDGPUFunctionArgInfo { enum PreloadedValue { // SGPRS: @@ -151,10 +154,13 @@ ArgDescriptor WorkItemIDY; ArgDescriptor WorkItemIDZ; + // Map the index of preloaded kernel arguments to its descriptor.
+ SmallDenseMap PreloadKernArgs{}; + std::tuple getPreloadedValue(PreloadedValue Value) const; - static constexpr AMDGPUFunctionArgInfo fixedABILayout(); + static AMDGPUFunctionArgInfo fixedABILayout(); }; class AMDGPUArgumentUsageInfo : public ImmutablePass { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -60,6 +60,7 @@ return false; } +// TODO: Print preload kernargs? void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const { for (const auto &FI : ArgInfoMap) { OS << "Arguments for " << FI.first->getName() << '\n' @@ -148,7 +149,7 @@ llvm_unreachable("unexpected preloaded value type"); } -constexpr AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() { +AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() { AMDGPUFunctionArgInfo AI; AI.PrivateSegmentBuffer = ArgDescriptor::createRegister(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -219,6 +219,11 @@ if (STM.isAmdHsaOS()) HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo); + + if (MF->getInfo()->getNumKernargPreloadedSGPRs() > 0) { + assert(AMDGPU::hasKernargPreload(STM)); + getTargetStreamer()->EmitKernargPreloadHeader(*getGlobalSTI()); + } } void AMDGPUAsmPrinter::emitFunctionBodyEnd() { @@ -436,6 +441,7 @@ const SIProgramInfo &PI) const { const GCNSubtarget &STM = MF.getSubtarget(); const Function &F = MF.getFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo(); amdhsa::kernel_descriptor_t KernelDescriptor; memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor)); @@ -459,6 +465,10 @@ KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A; + if 
(AMDGPU::hasKernargPreload(STM)) + KernelDescriptor.kernarg_preload = + static_cast(Info->getNumKernargPreloadedSGPRs()); + return KernelDescriptor; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -1023,7 +1023,8 @@ } GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, - const GCNSubtarget &ST) { + const GCNSubtarget &ST) + : ST(ST) { const CallingConv::ID CC = F.getCallingConv(); const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL; @@ -1064,30 +1065,35 @@ !ST.flatScratchIsArchitected()) { FlatScratchInit = true; } -} -unsigned GCNUserSGPRUsageInfo::getNumUsedUserSGPRs() const { - unsigned NumUserSGPRs = 0; if (hasImplicitBufferPtr()) - NumUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID); + NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID); if (hasPrivateSegmentBuffer()) - NumUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID); + NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID); if (hasDispatchPtr()) - NumUserSGPRs += getNumUserSGPRForField(DispatchPtrID); + NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID); if (hasQueuePtr()) - NumUserSGPRs += getNumUserSGPRForField(QueuePtrID); + NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID); if (hasKernargSegmentPtr()) - NumUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID); + NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID); if (hasDispatchID()) - NumUserSGPRs += getNumUserSGPRForField(DispatchIdID); + NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID); if (hasFlatScratchInit()) - NumUserSGPRs += getNumUserSGPRForField(FlatScratchInitID); + NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID); +} + +void GCNUserSGPRUsageInfo::allocKerargPreloadSGPRs(unsigned NumSGPRs) { + assert(NumKernargPreloadSGPRs + NumSGPRs <= 
AMDGPU::getMaxNumUserSGPRs(ST)); + NumKernargPreloadSGPRs += NumSGPRs; + NumUsedUserSGPRs += NumSGPRs; // Preload SGPRs also count toward the overall user SGPR total. +} - return NumUserSGPRs; +unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() const { // NOTE(review): unsigned subtraction below assumes NumUsedUserSGPRs never exceeds the target maximum - confirm. + return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs; } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1385,8 +1385,6 @@ class GCNUserSGPRUsageInfo { public: - unsigned getNumUsedUserSGPRs() const; - bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; } bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; } @@ -1401,6 +1399,14 @@ bool hasFlatScratchInit() const { return FlatScratchInit; } + unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; } + + unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; } + + unsigned getNumFreeUserSGPRs() const; // User SGPRs still available: target maximum minus those already used. + + void allocKerargPreloadSGPRs(unsigned NumSGPRs); // Claims NumSGPRs additional user SGPRs for kernarg preloading. + enum UserSGPRID : unsigned { ImplicitBufferPtrID = 0, PrivateSegmentBufferID = 1, @@ -1438,6 +1444,8 @@ GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST); private: + const GCNSubtarget &ST; // Subtarget used to query the maximum number of user SGPRs. + // Private memory buffer // Compute directly in sgpr[0:1] // Other shaders indirect 64-bits at sgpr[0:1] @@ -1454,6 +1462,10 @@ bool DispatchID = false; bool FlatScratchInit = false; + + unsigned NumKernargPreloadSGPRs = 0; // User SGPRs claimed for preloaded kernel arguments. + + unsigned NumUsedUserSGPRs = 0; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -90,6 +90,11 @@ /// \returns True on success, false on failure. virtual bool EmitCodeEnd(const MCSubtargetInfo &STI) { return true; } + /// \returns True on success, false on failure.
+ virtual bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) { + return true; + } + virtual void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, @@ -154,6 +159,9 @@ /// \returns True on success, false on failure. bool EmitCodeEnd(const MCSubtargetInfo &STI) override; + /// Emits the kernarg preload header (64 `s_nop 0` instructions). \returns True on success, false on failure. + bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) override; + void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, @@ -215,6 +223,9 @@ /// \returns True on success, false on failure. bool EmitCodeEnd(const MCSubtargetInfo &STI) override; + /// Emits the kernarg preload header (64 `s_nop 0` instructions). \returns True on success, false on failure. + bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) override; + void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -833,6 +833,24 @@ return true; } +bool AMDGPUTargetAsmStreamer::EmitKernargPreloadHeader( + const MCSubtargetInfo &STI) { // Textual form: writes 64 `s_nop 0` lines to OS; STI is unused. + for (int i = 0; i < 64; ++i) { + OS << "\ts_nop 0\n"; + } + return true; // Always reports success. +} + +bool AMDGPUTargetELFStreamer::EmitKernargPreloadHeader( + const MCSubtargetInfo &STI) { // Object form: emits the same 64 NOPs as raw 32-bit words (256 bytes); STI is unused. + const uint32_t Encoded_s_nop = 0xbf800000; // Same s_nop encoding constant that EmitCodeEnd below declares. + MCStreamer &OS = getStreamer(); + for (int i = 0; i < 64; ++i) { // NOTE(review): 64 is hard-coded in both streamers; the required header size is presumably fixed by hardware/firmware - confirm. + OS.emitInt32(Encoded_s_nop); + } + return true; // Always reports success. +} + bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { const uint32_t Encoded_s_code_end = 0xbf9f0000; const uint32_t Encoded_s_nop = 0xbf800000; diff --git
a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -546,6 +546,17 @@ const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const; + void allocatePreloadKernArgSGPRs(CCState &CCInfo, + SmallVectorImpl<CCValAssign> &ArgLocs, + const SmallVectorImpl<ISD::InputArg> &Ins, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const; + + void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const; + void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2232,14 +2232,88 @@ CCInfo.AllocateReg(FlatScratchInitReg); } + // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read + // these from the dispatch pointer. +} + +// Allocate user SGPRs for preloaded kernel arguments. Arguments to be preloaded must be marked inreg and be +// sequential starting from the first argument; allocation stops at the first argument that is not inreg or does not fit. +void SITargetLowering::allocatePreloadKernArgSGPRs( + CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs, + const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF, + const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { + Function &F = MF.getFunction(); + unsigned LastExplicitArgOffset = + MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset(); + GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo(); + bool InPreloadSequence = true; + unsigned InIdx = 0; // Index into Ins/ArgLocs; one IR argument may cover several entries. + for (auto &Arg : F.args()) { + if (!InPreloadSequence || !Arg.hasInRegAttr()) + break; + + int ArgIdx = Arg.getArgNo(); + // Don't preload non-original args or parts not in the current preload + // sequence.
+ if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() || + (int)Ins[InIdx].getOrigArgIndex() != ArgIdx)) + break; + + for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() && + (int)Ins[InIdx].getOrigArgIndex() == ArgIdx; + InIdx++) { + assert(ArgLocs[InIdx].isMemLoc()); + auto &ArgLoc = ArgLocs[InIdx]; + const Align KernelArgBaseAlign = Align(16); + unsigned ArgOffset = ArgLoc.getLocMemOffset(); + Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset); + unsigned NumAllocSGPRs = + alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32; + + // Sub-dword arg that is not dword aligned: it shares the SGPR of the previous part. NOTE(review): assumes InIdx >= 1 and that the previous part was preloaded - confirm. + if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) { + Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back( + Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]); + continue; + } + + unsigned Padding = ArgOffset - LastExplicitArgOffset; + unsigned PaddingSGPRs = alignTo(Padding, 4) / 4; // Padding bytes rounded up to whole SGPRs (4 bytes each). + // Check for free user SGPRs for preloading. + if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ > + SGPRInfo.getNumFreeUserSGPRs()) { + InPreloadSequence = false; + break; + } + + // Preload this argument. + const TargetRegisterClass *RC = + TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32); + SmallVectorImpl<MCRegister> *PreloadRegs = + Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs); + + if (PreloadRegs->size() > 1) // Split across individual SGPRs: each one is a 32-bit live-in. + RC = &AMDGPU::SGPR_32RegClass; + for (auto &Reg : *PreloadRegs) { + assert(Reg); + MF.addLiveIn(Reg, RC); + CCInfo.AllocateReg(Reg); + } + + LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset; // Byte offset just past the SGPRs allocated for this part. + } + } +} + +void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const { + // Always allocate this last since it is a synthetic preload. if (Info.hasLDSKernelId()) { Register Reg = Info.addLDSKernelId(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } - - // TODO: Add GridWorkGroupCount user SGPRs when used.
For now with HSA we read - // these from the dispatch pointer. } // Allocate special input registers that are initialized per-wave. @@ -2545,17 +2619,22 @@ Splits.append(Ins.begin(), Ins.end()); } + if (IsKernel) + analyzeFormalArgumentsCompute(CCInfo, Ins); + if (IsEntryFunc) { allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info); + if (IsKernel && Subtarget->hasKernargPreload()) + allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info); + + allocateLDSKernelId(CCInfo, MF, *TRI, *Info); } else if (!IsGraphics) { // For the fixed ABI, pass workitem IDs in the last argument register. allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); } - if (IsKernel) { - analyzeFormalArgumentsCompute(CCInfo, Ins); - } else { + if (!IsKernel) { CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg); CCInfo.AnalyzeFormalArguments(Splits, AssignFn); } @@ -2601,9 +2680,81 @@ continue; } - SDValue Arg = lowerKernargMemParameter( - DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]); - Chains.push_back(Arg.getValue(1)); + SDValue NewArg; + if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) { + if (MemVT.getStoreSize() < 4 && Alignment < 4) { + // In this case the argument is packed into the previous preload SGPR. 
+ int64_t AlignDownOffset = alignDown(Offset, 4); + int64_t OffsetDiff = Offset - AlignDownOffset; + EVT IntVT = MemVT.changeTypeToInteger(); + + const SIMachineFunctionInfo *Info = + MF.getInfo(); + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + Register Reg = + Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0]; + + assert(Reg); + Register VReg = MRI.getLiveInVirtReg(Reg); + SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32); + + SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32); + SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt); + + SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract); + ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal); + NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal, + Ins[i].Flags.isSExt(), &Ins[i]); + + NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL); + } else { + const SIMachineFunctionInfo *Info = + MF.getInfo(); + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + const SmallVectorImpl &PreloadRegs = + Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs; + + SDValue Copy; + if (PreloadRegs.size() == 1) { + Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]); + const TargetRegisterClass *RC = MRI.getRegClass(VReg); + NewArg = DAG.getCopyFromReg( + Chain, DL, VReg, + EVT::getIntegerVT(*DAG.getContext(), + TRI->getRegSizeInBits(*RC))); + + } else { + // If the kernarg alignment does not match the alignment of the SGPR + // tuple RC that can accommodate this argument, it will be built up + // via copies from the individual SGPRs that the argument was + // preloaded to.
+ SmallVector Elts; + for (auto Reg : PreloadRegs) { + Register VReg = MRI.getLiveInVirtReg(Reg); + Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32); + Elts.push_back(Copy); + } + NewArg = + DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32, + PreloadRegs.size()), + DL, Elts); + } + + SDValue CMemVT; + if (VT.isScalarInteger() && VT.bitsLT(NewArg.getSimpleValueType())) + CMemVT = DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewArg); + else + CMemVT = DAG.getBitcast(MemVT, NewArg); + NewArg = convertArgType(DAG, VT, MemVT, DL, CMemVT, + Ins[i].Flags.isSExt(), &Ins[i]); + NewArg = DAG.getMergeValues({NewArg, Chain}, DL); + } + } else { + NewArg = + lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset, + Alignment, Ins[i].Flags.isSExt(), &Ins[i]); + } + Chains.push_back(NewArg.getValue(1)); auto *ParamTy = dyn_cast(FType->getParamType(Ins[i].getOrigArgIndex())); @@ -2613,11 +2764,11 @@ // On SI local pointers are just offsets into LDS, so they are always // less than 16-bits. On CI and newer they could potentially be // real pointers, so we can't guarantee their size. 
- Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, - DAG.getValueType(MVT::i16)); + NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg, + DAG.getValueType(MVT::i16)); } - InVals.push_back(Arg); + InVals.push_back(NewArg); continue; } else if (!IsEntryFunc && VA.isMemLoc()) { SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -21,6 +21,7 @@ #include "SIInstrInfo.h" #include "SIModeRegisterDefaults.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/Support/raw_ostream.h" @@ -593,6 +594,8 @@ return PrologEpilogSGPRSpills; } + GCNUserSGPRUsageInfo &getUserSGPRInfo() { return UserSGPRInfo; } + const GCNUserSGPRUsageInfo &getUserSGPRInfo() const { return UserSGPRInfo; } void addToPrologEpilogSGPRSpills(Register Reg, @@ -727,6 +730,10 @@ Register addFlatScratchInit(const SIRegisterInfo &TRI); Register addImplicitBufferPtr(const SIRegisterInfo &TRI); Register addLDSKernelId(); + SmallVectorImpl * + addPreloadedKernArg(const SIRegisterInfo &TRI, const TargetRegisterClass *RC, + unsigned AllocSizeDWord, int KernArgIdx, + int PaddingSGPRs); /// Increment user SGPRs used for padding the argument list only. 
Register addReservedUserSGPR() { @@ -872,6 +879,10 @@ return NumUserSGPRs + NumSystemSGPRs; } + unsigned getNumKernargPreloadedSGPRs() const { + return UserSGPRInfo.getNumKernargPreloadSGPRs(); + } + Register getPrivateSegmentWaveByteOffsetSystemSGPR() const { return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -243,6 +243,31 @@ return ArgInfo.LDSKernelId.getRegister(); } +SmallVectorImpl *SIMachineFunctionInfo::addPreloadedKernArg( + const SIRegisterInfo &TRI, const TargetRegisterClass *RC, + unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) { + assert(!ArgInfo.PreloadKernArgs.count(KernArgIdx) && + "Preload kernel argument allocated twice."); + NumUserSGPRs += PaddingSGPRs; + // If a register tuple of class RC starts at the next free user SGPR, preload + // the kernarg into that single tuple; otherwise fall back to AllocSizeDWord + // individual SGPRs, which are merged later when the argument is lowered. + if (Register PreloadReg = + TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC)) { + ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(PreloadReg); + NumUserSGPRs += AllocSizeDWord; + } else { + for (unsigned I = 0; I < AllocSizeDWord; ++I) { + ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(getNextUserSGPR()); + NumUserSGPRs++; + } + } + + // Track the actual number of SGPRs that HW will preload to (argument dwords plus padding SGPRs). + UserSGPRInfo.allocKerargPreloadSGPRs(AllocSizeDWord + PaddingSGPRs); + return &ArgInfo.PreloadKernArgs[KernArgIdx].Regs; // NOTE(review): points into the PreloadKernArgs map; presumably invalidated if the map later grows - confirm callers use it before inserting again. +} + void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size, Align Alignment) { // Skip if it is an entry function or the register is already added. @@ -570,6 +595,7 @@ return true; }; + // TODO: Need to serialize kernarg preloads.
bool Any = false; Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer); Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -22,7 +22,7 @@ declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, i32, i1) -define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -69,7 +69,7 @@ ret void } -define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -105,7 +105,7 @@ ret void } -define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -152,7 +152,7 @@ ret void } -define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define 
amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -188,7 +188,7 @@ ret void } -define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -235,7 +235,7 @@ ret void } -define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -271,7 +271,7 @@ ret void } -define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -318,7 +318,7 @@ ret void } -define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -354,7 +354,7 @@ ret void } -define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -401,7 +401,7 @@ ret void } -define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -437,7 +437,7 @@ ret void } -define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -484,7 +484,7 @@ ret void } -define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -520,7 +520,7 @@ ret void } -define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) 
{ ; GFX90A-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -567,7 +567,7 @@ ret void } -define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -603,7 +603,7 @@ ret void } -define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -650,7 +650,7 @@ ret void } -define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -686,7 +686,7 @@ ret void } -define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -733,7 +733,7 @@ ret void } -define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, 
double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -769,7 +769,7 @@ ret void } -define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -816,7 +816,7 @@ ret void } -define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -852,7 +852,7 @@ ret void } -define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -899,7 +899,7 @@ ret void } -define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; 
%main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -935,7 +935,7 @@ ret void } -define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -982,7 +982,7 @@ ret void } -define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll --- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -541,7 +541,7 @@ ret void } -define protected amdgpu_kernel void @buffer.ptr.atomic.swap(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.ptr.atomic.swap(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.swap: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -563,7 +563,7 @@ ret void } -define protected amdgpu_kernel void @buffer.ptr.atomic.add(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.ptr.atomic.add(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -585,7 +585,7 @@ ret void } -define protected 
amdgpu_kernel void @buffer.ptr.atomic.sub(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.ptr.atomic.sub(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.sub: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -607,7 +607,7 @@ ret void } -define protected amdgpu_kernel void @buffer.ptr.atomic.smin(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.ptr.atomic.smin(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.smin: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -629,7 +629,7 @@ ret void } -define protected amdgpu_kernel void @buffer.ptr.atomic.smax(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.ptr.atomic.smax(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.smax: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -651,7 +651,7 @@ ret void } -define protected amdgpu_kernel void @buffer.ptr.atomic.umin(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.ptr.atomic.umin(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.umin: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -673,7 +673,7 @@ ret void } -define protected amdgpu_kernel void @buffer.ptr.atomic.umax(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.ptr.atomic.umax(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.umax: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -695,7 +695,7 @@ ret void } -define protected amdgpu_kernel void @buffer.ptr.atomic.and(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) 
%q) { +define protected amdgpu_kernel void @buffer.ptr.atomic.and(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.and: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -717,7 +717,7 @@ ret void } -define protected amdgpu_kernel void @buffer.ptr.atomic.or(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.ptr.atomic.or(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.or: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -739,7 +739,7 @@ ret void } -define protected amdgpu_kernel void @buffer.ptr.atomic.xor(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.ptr.atomic.xor(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.xor: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -761,7 +761,7 @@ ret void } -define protected amdgpu_kernel void @buffer.ptr.atomic.inc(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.ptr.atomic.inc(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.inc: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -783,7 +783,7 @@ ret void } -define protected amdgpu_kernel void @buffer.ptr.atomic.dec(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.ptr.atomic.dec(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.dec: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -805,7 +805,7 @@ ret void } -define protected amdgpu_kernel void @buffer.ptr.atomic.cmpswap(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.ptr.atomic.cmpswap(ptr addrspace(8) %rsrc, i32 %vindex, ptr 
addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.cmpswap: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -828,7 +828,7 @@ ret void } -define protected amdgpu_kernel void @buffer.ptr.atomic.fadd(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.ptr.atomic.fadd(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.fadd: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -852,7 +852,7 @@ ret void } -define protected amdgpu_kernel void @buffer.ptr.atomic.fmin(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.ptr.atomic.fmin(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.fmin: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 @@ -877,7 +877,7 @@ ret void } -define protected amdgpu_kernel void @buffer.ptr.atomic.fmax(ptr addrspace(8) inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { +define protected amdgpu_kernel void @buffer.ptr.atomic.fmax(ptr addrspace(8) %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.ptr.atomic.fmax: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -24,7 +24,7 @@ declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, i32, i1) -define amdgpu_kernel void @buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x34 @@ -71,7 +71,7 @@ ret void } -define amdgpu_kernel void @buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -107,7 +107,7 @@ ret void } -define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -154,7 +154,7 @@ ret void } -define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -190,7 +190,7 @@ ret void } -define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -237,7 +237,7 @@ ret void } -define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr 
addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -273,7 +273,7 @@ ret void } -define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -320,7 +320,7 @@ ret void } -define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -356,7 +356,7 @@ ret void } -define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -403,7 +403,7 @@ ret void } -define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -439,7 +439,7 @@ ret void } -define amdgpu_kernel void 
@raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -486,7 +486,7 @@ ret void } -define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -522,7 +522,7 @@ ret void } -define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -569,7 +569,7 @@ ret void } -define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -605,7 +605,7 @@ ret void } -define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; 
GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -652,7 +652,7 @@ ret void } -define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -688,7 +688,7 @@ ret void } -define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -735,7 +735,7 @@ ret void } -define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -771,7 +771,7 @@ ret void } -define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -818,7 +818,7 @@ ret void } -define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void 
@raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -854,7 +854,7 @@ ret void } -define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -901,7 +901,7 @@ ret void } -define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -937,7 +937,7 @@ ret void } -define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -984,7 +984,7 @@ ret void } -define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -1020,7 +1020,7 @@ 
ret void } -define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) { +define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_noret_f64: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 @@ -1067,7 +1067,7 @@ ret void } -define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { +define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) { ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll @@ -0,0 +1,19 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx90a --disassemble - | FileCheck -check-prefixes=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN %s + +; GCN: preload_kernarg_header +; GCN-COUNT-64: s_nop 0 +define amdgpu_kernel void @preload_kernarg_header(ptr %arg) { + store ptr %arg, ptr %arg + ret void +} + +; GCN: non_kernel_function +; GCN-NOT: s_nop 0 +; GCN: 
flat_store +define void @non_kernel_function(ptr %arg) { + store ptr %arg, ptr %arg + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll @@ -0,0 +1,5418 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=NO-PRELOAD %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRELOAD-1 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRELOAD-2 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRELOAD-4 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRELOAD-8 %s + +define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) { +; NO-PRELOAD-LABEL: ptr1_i8: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: s_and_b32 s2, s2, 0xff +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: ptr1_i8: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: 
s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: s_and_b32 s0, s0, 0xff +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-1-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: ptr1_i8: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 
+; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xff +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: ptr1_i8: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; 
PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: s_and_b32 s0, s8, 0xff +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-4-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: ptr1_i8: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; 
PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xff +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-8-NEXT: s_endpgm + %ext = zext i8 %arg0 to i32 + store i32 %ext, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %arg0) { +; NO-PRELOAD-LABEL: ptr1_i8_zext_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 
+; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: s_and_b32 s2, s2, 0xff +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: ptr1_i8_zext_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; 
PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: s_and_b32 s0, s0, 0xff +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-1-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: ptr1_i8_zext_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 
+; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: s_mov_b32 s0, 0xffff +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-2-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: ptr1_i8_zext_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; 
PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: s_mov_b32 s0, 0xffff +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s8 +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-4-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; PRELOAD-4-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: ptr1_i8_zext_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; 
PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: s_mov_b32 s0, 0xffff +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-8-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-8-NEXT: s_endpgm + %ext = zext i8 %arg0 to i32 + store i32 %ext, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0) { +; NO-PRELOAD-LABEL: ptr1_i16_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: s_and_b32 s2, s2, 0xffff +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: ptr1_i16_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: 
s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: s_and_b32 s0, s0, 0xffff +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-1-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: ptr1_i16_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; 
PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xffff +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: ptr1_i16_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; 
PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: s_and_b32 s0, s8, 0xffff +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-4-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: ptr1_i16_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; 
PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xffff +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-8-NEXT: s_endpgm + %ext = zext i16 %arg0 to i32 + store i32 %ext, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0) { +; NO-PRELOAD-LABEL: ptr1_i32_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: ptr1_i32_preload_arg: +; PRELOAD-1: s_nop 0 +; 
PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-1-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: 
ptr1_i32_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: ptr1_i32_preload_arg: +; PRELOAD-4: s_nop 0 
+; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s8 +; PRELOAD-4-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: ptr1_i32_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; 
PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-8-NEXT: s_endpgm + store i32 %arg0, ptr addrspace(1) %out + ret void +} + +; Check alignment on the second preloaded arg. 
+ +define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) { +; NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x10 +; NO-PRELOAD-NEXT: s_load_dword s3, s[4:5], 0x0 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: s_add_i32 s2, s3, s2 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; 
PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x10 +; PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: s_add_i32 s2, s6, s2 +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s2 +; PRELOAD-1-NEXT: global_store_dword v0, v1, s[0:1] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: 
s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x10 +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-2-NEXT: s_add_i32 s0, s6, s0 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-2-NEXT: global_store_dword v0, v1, s[8:9] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; 
PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: s_add_i32 s0, s6, s10 +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-4-NEXT: global_store_dword v0, v1, s[8:9] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 
+; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: s_add_i32 s0, s6, s10 +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-8-NEXT: global_store_dword v0, v1, s[8:9] +; PRELOAD-8-NEXT: s_endpgm + %add = add i32 %arg0, %arg1 + store i32 %add, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) { +; NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: s_lshr_b32 s3, s2, 16 +; NO-PRELOAD-NEXT: s_and_b32 s2, s2, 0xffff +; NO-PRELOAD-NEXT: s_add_i32 s2, s2, s3 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; NO-PRELOAD-NEXT: global_store_dword v0, v1, s[0:1] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; 
PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: s_lshr_b32 s1, s0, 16 +; PRELOAD-1-NEXT: s_and_b32 s0, s0, 0xffff +; PRELOAD-1-NEXT: s_add_i32 s0, s0, s1 +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-1-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: 
s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x8 +; PRELOAD-2-NEXT: s_and_b32 s1, s8, 0xffff +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-2-NEXT: s_lshr_b32 s0, s0, 16 +; PRELOAD-2-NEXT: s_add_i32 s0, s1, s0 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; 
PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 16 +; PRELOAD-4-NEXT: s_and_b32 s1, s8, 0xffff +; PRELOAD-4-NEXT: s_add_i32 s0, s1, s0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-4-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: 
ptr1_i16_i16_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 +; PRELOAD-8-NEXT: s_and_b32 s1, s8, 0xffff +; PRELOAD-8-NEXT: s_add_i32 s0, s1, s0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-8-NEXT: 
global_store_dword v0, v1, s[6:7] +; PRELOAD-8-NEXT: s_endpgm + %ext = zext i16 %arg0 to i32 + %ext1 = zext i16 %arg1 to i32 + %add = add i32 %ext, %ext1 + store i32 %add, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) { +; NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dword s2, s[4:5], 0x8 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s2 +; NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: ptr1_v2i8_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; 
PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-1-NEXT: global_store_short v0, v1, s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: ptr1_v2i8_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; 
PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 +; PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 +; PRELOAD-2-NEXT: global_store_short v1, v0, s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: ptr1_v2i8_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: 
s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 8 +; PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-4-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, 0 +; PRELOAD-4-NEXT: global_store_short v1, v0, s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: ptr1_v2i8_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; 
PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 +; PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 +; PRELOAD-8-NEXT: global_store_short v1, v0, s[6:7] +; PRELOAD-8-NEXT: s_endpgm + store <2 x i8> %in, ptr addrspace(1) %out + ret void +} + +; Don't try to preload byref args. 
+ +define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) { +; NO-PRELOAD-LABEL: byref_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s1 +; NO-PRELOAD-NEXT: global_store_dword v0, v1, s[2:3] +; NO-PRELOAD-NEXT: s_waitcnt vmcnt(0) +; NO-PRELOAD-NEXT: global_store_dword v0, v2, s[2:3] +; NO-PRELOAD-NEXT: s_waitcnt vmcnt(0) +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: byref_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 
0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s1 +; PRELOAD-1-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-1-NEXT: s_waitcnt vmcnt(0) +; PRELOAD-1-NEXT: global_store_dword v0, v2, s[6:7] +; PRELOAD-1-NEXT: s_waitcnt vmcnt(0) +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: byref_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: 
s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s1 +; PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-2-NEXT: s_waitcnt vmcnt(0) +; PRELOAD-2-NEXT: global_store_dword v0, v2, s[6:7] +; PRELOAD-2-NEXT: s_waitcnt vmcnt(0) +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: byref_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; 
PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v2, s1 +; PRELOAD-4-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-4-NEXT: s_waitcnt vmcnt(0) +; PRELOAD-4-NEXT: global_store_dword v0, v2, s[6:7] +; PRELOAD-4-NEXT: s_waitcnt vmcnt(0) +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: byref_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 
+; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s1 +; PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-8-NEXT: s_waitcnt vmcnt(0) +; PRELOAD-8-NEXT: global_store_dword v0, v2, s[6:7] +; PRELOAD-8-NEXT: s_waitcnt vmcnt(0) +; PRELOAD-8-NEXT: s_endpgm + %in = load i32, ptr addrspace(4) %in.byref + store volatile i32 %in, ptr addrspace(1) %out, align 4 + store volatile i32 %after.offset, ptr addrspace(1) %out, align 4 + ret void +} + +; TODO: Should do partial preload in cases like these where only part of the arg +; can be preloaded. 
+ +define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind { +; NO-PRELOAD-LABEL: v8i32_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s12 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s13 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s14 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s15 +; NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; NO-PRELOAD-NEXT: s_nop 0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s8 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s9 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s10 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s11 +; NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: v8i32_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: 
s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, s12 +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s13 +; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s14 +; PRELOAD-1-NEXT: v_mov_b32_e32 v3, s15 +; PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9 +; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 +; PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 +; PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: v8i32_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: 
s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 +; PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, s12 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s13 +; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s14 +; PRELOAD-2-NEXT: v_mov_b32_e32 v3, s15 +; PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 +; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 +; PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 +; PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: v8i32_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: 
s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 +; PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, s12 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s13 +; PRELOAD-4-NEXT: v_mov_b32_e32 v2, s14 +; PRELOAD-4-NEXT: v_mov_b32_e32 v3, s15 +; PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 +; 
PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 +; PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 +; PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: v8i32_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], 
s[4:5], 0x20 +; PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 +; PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, s12 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s13 +; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s14 +; PRELOAD-8-NEXT: v_mov_b32_e32 v3, s15 +; PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 +; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 +; PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11 +; PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PRELOAD-8-NEXT: s_endpgm + store <8 x i32> %in, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind { +; NO-PRELOAD-LABEL: v3i16_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 +; NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: v3i16_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; 
PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1 +; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s0 +; PRELOAD-1-NEXT: global_store_short v0, v1, s[6:7] offset:4 +; PRELOAD-1-NEXT: global_store_dword v0, v2, s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: v3i16_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; 
PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 +; PRELOAD-2-NEXT: global_store_short v0, v1, s[6:7] offset:4 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 +; PRELOAD-2-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: v3i16_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 
+; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 +; PRELOAD-4-NEXT: global_store_short v0, v1, s[6:7] offset:4 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s8 +; PRELOAD-4-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: v3i16_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 
0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 +; PRELOAD-8-NEXT: global_store_short v0, v1, s[6:7] offset:4 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 +; PRELOAD-8-NEXT: global_store_dword v0, v1, s[6:7] +; PRELOAD-8-NEXT: s_endpgm + store <3 x i16> %in, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind { +; NO-PRELOAD-LABEL: v3i32_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_mov_b32_e32 
v0, s0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: v3i32_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x10 +; PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, s0 +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1 +; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 +; PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: v3i32_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: 
s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 +; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 +; PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: v3i32_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; 
PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, s10 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s11 +; PRELOAD-4-NEXT: v_mov_b32_e32 v2, s12 +; PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 +; PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: v3i32_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; 
PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; PRELOAD-8-NEXT: s_endpgm + store <3 x i32> %in, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind { +; NO-PRELOAD-LABEL: v3f32_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v3, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s1 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; NO-PRELOAD-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: v3f32_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; 
PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, s0 +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1 +; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s2 +; PRELOAD-1-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: v3f32_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; 
PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 +; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s12 +; PRELOAD-2-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: v3f32_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; 
PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, s10 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s11 +; PRELOAD-4-NEXT: v_mov_b32_e32 v2, s12 +; PRELOAD-4-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: v3f32_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; 
PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 +; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s12 +; PRELOAD-8-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; PRELOAD-8-NEXT: s_endpgm + store <3 x float> %in, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind { +; NO-PRELOAD-LABEL: v5i8_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 +; NO-PRELOAD-NEXT: global_store_byte v0, v1, s[0:1] offset:4 +; NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] +; NO-PRELOAD-NEXT: 
s_endpgm +; +; PRELOAD-1-LABEL: v5i8_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s1 +; PRELOAD-1-NEXT: 
v_mov_b32_e32 v2, s0 +; PRELOAD-1-NEXT: global_store_byte v0, v1, s[6:7] offset:4 +; PRELOAD-1-NEXT: global_store_dword v0, v2, s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: v5i8_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 +; 
PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24 +; PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16 +; PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, 0 +; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s9 +; PRELOAD-2-NEXT: global_store_byte v1, v2, s[6:7] offset:4 +; PRELOAD-2-NEXT: global_store_dword v1, v0, s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: v5i8_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: 
s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 8 +; PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 24 +; PRELOAD-4-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 16 +; PRELOAD-4-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-4-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-4-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, 0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v2, s9 +; PRELOAD-4-NEXT: global_store_byte v1, v2, s[6:7] offset:4 +; PRELOAD-4-NEXT: global_store_dword v1, v0, s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: v5i8_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: 
s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 +; PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 +; PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 +; PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, 0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s9 +; PRELOAD-8-NEXT: global_store_byte v1, v2, s[6:7] offset:4 +; PRELOAD-8-NEXT: global_store_dword v1, v0, s[6:7] +; PRELOAD-8-NEXT: s_endpgm + store <5 x i8> %in, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, 
<5 x double> %in) nounwind { +; NO-PRELOAD-LABEL: v5f64_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 +; NO-PRELOAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; NO-PRELOAD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v4, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s12 +; NO-PRELOAD-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s13 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s14 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s15 +; NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 +; NO-PRELOAD-NEXT: s_nop 0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s8 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s9 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s10 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v3, s11 +; NO-PRELOAD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: v5f64_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; 
PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 +; PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, s12 +; PRELOAD-1-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s13 +; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s14 +; PRELOAD-1-NEXT: v_mov_b32_e32 v3, s15 +; PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: v_mov_b32_e32 v0, s8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v1, s9 +; PRELOAD-1-NEXT: v_mov_b32_e32 v2, s10 +; PRELOAD-1-NEXT: v_mov_b32_e32 v3, s11 +; PRELOAD-1-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: v5f64_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; 
PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 +; PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 +; PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-2-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; PRELOAD-2-NEXT: v_mov_b32_e32 v0, s12 +; PRELOAD-2-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s13 +; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s14 +; PRELOAD-2-NEXT: v_mov_b32_e32 v3, s15 +; PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; PRELOAD-2-NEXT: s_nop 0 +; 
PRELOAD-2-NEXT: v_mov_b32_e32 v0, s8 +; PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 +; PRELOAD-2-NEXT: v_mov_b32_e32 v2, s10 +; PRELOAD-2-NEXT: v_mov_b32_e32 v3, s11 +; PRELOAD-2-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: v5f64_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: 
s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 +; PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 +; PRELOAD-4-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-4-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, s12 +; PRELOAD-4-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s13 +; PRELOAD-4-NEXT: v_mov_b32_e32 v2, s14 +; PRELOAD-4-NEXT: v_mov_b32_e32 v3, s15 +; PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: v_mov_b32_e32 v0, s8 +; PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 +; PRELOAD-4-NEXT: v_mov_b32_e32 v2, s10 +; PRELOAD-4-NEXT: v_mov_b32_e32 v3, s11 +; PRELOAD-4-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: v5f64_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: 
s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 +; PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 +; PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-8-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, s12 +; PRELOAD-8-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s13 +; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s14 +; PRELOAD-8-NEXT: v_mov_b32_e32 v3, s15 +; PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: v_mov_b32_e32 v0, s8 +; PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 +; PRELOAD-8-NEXT: v_mov_b32_e32 v2, s10 +; PRELOAD-8-NEXT: v_mov_b32_e32 v3, s11 +; PRELOAD-8-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; PRELOAD-8-NEXT: s_endpgm + store <5 x double> %in, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) { +; NO-PRELOAD-LABEL: v8i8_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; NO-PRELOAD-NEXT: 
s_endpgm +; +; PRELOAD-1-LABEL: v8i8_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] 
op_sel:[0,1] +; PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: v8i8_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 8 +; PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-2-NEXT: 
s_lshr_b32 s0, s9, 24 +; PRELOAD-2-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 16 +; PRELOAD-2-NEXT: v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-2-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 +; PRELOAD-2-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 24 +; PRELOAD-2-NEXT: v_lshlrev_b16_e64 v2, 8, s0 +; PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 16 +; PRELOAD-2-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-2-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-2-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: v8i8_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; 
PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: s_lshr_b32 s0, s9, 8 +; PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-4-NEXT: s_lshr_b32 s0, s9, 24 +; PRELOAD-4-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; PRELOAD-4-NEXT: s_lshr_b32 s0, s9, 16 +; PRELOAD-4-NEXT: v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-4-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 8 +; PRELOAD-4-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 24 +; PRELOAD-4-NEXT: v_lshlrev_b16_e64 v2, 8, s0 +; PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 16 +; PRELOAD-4-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-4-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-4-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: v8i8_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 8 +; 
PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 24 +; PRELOAD-8-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 16 +; PRELOAD-8-NEXT: v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-8-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 +; PRELOAD-8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 +; PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 24 +; PRELOAD-8-NEXT: v_lshlrev_b16_e64 v2, 8, s0 +; PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 +; PRELOAD-8-NEXT: v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-8-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; PRELOAD-8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-8-NEXT: s_endpgm + store <8 x i8> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) { +; NO-PRELOAD-LABEL: i64_kernel_preload_arg: +; NO-PRELOAD: ; %bb.0: +; NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s2 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 +; NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: i64_kernel_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; 
PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: i64_kernel_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 
0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: i64_kernel_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; 
PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-4-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: i64_kernel_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; 
PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-8-NEXT: s_endpgm + store i64 %a, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double %in) { +; NO-PRELOAD-LABEL: f64_kernel_preload_arg: +; NO-PRELOAD: ; 
%bb.0: +; NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0 +; NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) +; NO-PRELOAD-NEXT: v_mov_b32_e32 v0, s2 +; NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 +; NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; NO-PRELOAD-NEXT: s_endpgm +; +; PRELOAD-1-LABEL: f64_kernel_preload_arg: +; PRELOAD-1: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: 
s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: s_nop 0 +; PRELOAD-1-NEXT: ; %bb.0: +; PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-1-NEXT: s_waitcnt lgkmcnt(0) +; PRELOAD-1-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; PRELOAD-1-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-1-NEXT: s_endpgm +; +; PRELOAD-2-LABEL: f64_kernel_preload_arg: +; PRELOAD-2: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; 
PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: s_nop 0 +; PRELOAD-2-NEXT: ; %bb.0: +; PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; PRELOAD-2-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-2-NEXT: s_endpgm +; +; PRELOAD-4-LABEL: f64_kernel_preload_arg: +; PRELOAD-4: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; 
PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: s_nop 0 +; PRELOAD-4-NEXT: ; %bb.0: +; PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-4-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; PRELOAD-4-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-4-NEXT: s_endpgm +; +; PRELOAD-8-LABEL: f64_kernel_preload_arg: +; PRELOAD-8: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; 
PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: s_nop 0 +; PRELOAD-8-NEXT: ; %bb.0: +; PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 +; PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] +; PRELOAD-8-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; PRELOAD-8-NEXT: s_endpgm + store double %in, ptr addrspace(1) %out + ret void +}