Index: llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -23,8 +23,8 @@
 class MachineInstrBuilder;
 
 class AMDGPUCallLowering: public CallLowering {
-  Register lowerParameterPtr(MachineIRBuilder &B, Type *ParamTy,
-                             uint64_t Offset) const;
+  void lowerParameterPtr(Register DstReg, MachineIRBuilder &B, Type *ParamTy,
+                         uint64_t Offset) const;
 
   void lowerParameter(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset,
                       Align Alignment, Register DstReg) const;
Index: llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -397,24 +397,19 @@
   return true;
 }
 
-Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &B,
-                                               Type *ParamTy,
-                                               uint64_t Offset) const {
-
+void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
+                                           Type *ParamTy,
+                                           uint64_t Offset) const {
   MachineFunction &MF = B.getMF();
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  const Function &F = MF.getFunction();
-  const DataLayout &DL = F.getParent()->getDataLayout();
-  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
-  LLT PtrType = getLLTForType(*PtrTy, DL);
   Register KernArgSegmentPtr =
     MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
   Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
 
   auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);
 
-  return B.buildPtrAdd(PtrType, KernArgSegmentVReg, OffsetReg).getReg(0);
+  B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
 }
 
 void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
@@ -425,7 +420,10 @@
   const DataLayout &DL = F.getParent()->getDataLayout();
   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
   unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
-  Register PtrReg = lowerParameterPtr(B, ParamTy, Offset);
+
+  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+  Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
+  lowerParameterPtr(PtrReg, B, ParamTy, Offset);
 
   MachineMemOperand *MMO = MF.getMachineMemOperand(
       PtrInfo,
@@ -512,26 +510,47 @@
 
   // TODO: Align down to dword alignment and extract bits for extending loads.
   for (auto &Arg : F.args()) {
-    Type *ArgTy = Arg.getType();
+    const bool IsByVal = Arg.hasByValAttr();
+    Type *ArgTy = IsByVal ? Arg.getParamByValType() : Arg.getType();
     unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
     if (AllocSize == 0)
       continue;
 
-    unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);
+    unsigned ABIAlign = IsByVal ? Arg.getParamAlignment() : 0;
+    if (ABIAlign == 0)
+      ABIAlign = DL.getABITypeAlignment(ArgTy);
 
     uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
     ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
 
-    ArrayRef<Register> OrigArgRegs = VRegs[i];
-    Register ArgReg =
-      OrigArgRegs.size() == 1
-      ? OrigArgRegs[0]
-      : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
     Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);
     ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
-    lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
-    if (OrigArgRegs.size() > 1)
-      unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
+
+    if (IsByVal) {
+      unsigned ByValAS = cast<PointerType>(Arg.getType())->getAddressSpace();
+
+      assert(VRegs[i].size() == 1 &&
+             "expected only one register for byval pointers");
+      if (ByValAS == AMDGPUAS::CONSTANT_ADDRESS) {
+        lowerParameterPtr(VRegs[i][0], B, ArgTy, ArgOffset);
+      } else {
+        const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+        Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy);
+        lowerParameterPtr(PtrReg, B, ArgTy, ArgOffset);
+
+        B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
+      }
+    } else {
+      ArrayRef<Register> OrigArgRegs = VRegs[i];
+      Register ArgReg =
+        OrigArgRegs.size() == 1
+        ? OrigArgRegs[0]
+        : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
+
+      lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
+      if (OrigArgRegs.size() > 1)
+        unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
+    }
 
     ++i;
   }
Index: llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -87,11 +87,12 @@
   void emitKernelArg(const Argument &Arg, unsigned &Offset,
                      msgpack::ArrayDocNode Args);
 
-  void emitKernelArg(const DataLayout &DL, Type *Ty, StringRef ValueKind,
-                     unsigned &Offset, msgpack::ArrayDocNode Args,
-                     unsigned PointeeAlign = 0, StringRef Name = "",
-                     StringRef TypeName = "", StringRef BaseTypeName = "",
-                     StringRef AccQual = "", StringRef TypeQual = "");
+  void emitKernelArg(const DataLayout &DL, Type *Ty, unsigned Align,
+                     StringRef ValueKind, unsigned &Offset,
+                     msgpack::ArrayDocNode Args, unsigned PointeeAlign = 0,
+                     StringRef Name = "", StringRef TypeName = "",
+                     StringRef BaseTypeName = "", StringRef AccQual = "",
+                     StringRef TypeQual = "");
 
   void emitHiddenKernelArgs(const Function &Func, unsigned &Offset,
                             msgpack::ArrayDocNode Args);
@@ -158,8 +159,8 @@
 
   void emitKernelArg(const Argument &Arg);
 
-  void emitKernelArg(const DataLayout &DL, Type *Ty, ValueKind ValueKind,
-                     unsigned PointeeAlign = 0,
+  void emitKernelArg(const DataLayout &DL, Type *Ty, unsigned Align,
+                     ValueKind ValueKind, unsigned PointeeAlign = 0,
                      StringRef Name = "", StringRef TypeName = "",
                      StringRef BaseTypeName = "", StringRef AccQual = "",
                      StringRef TypeQual = "");
Index: llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -24,6 +24,23 @@
 #include "llvm/IR/Module.h"
 #include "llvm/Support/raw_ostream.h"
 
+using namespace llvm;
+
+static std::pair<Type *, unsigned> getArgumentTypeAlign(const Argument &Arg,
+                                                        const DataLayout &DL) {
+  Type *Ty = Arg.getType();
+  unsigned ArgAlign = 0;
+  if (Arg.hasByValAttr()) {
+    Ty = Arg.getParamByValType();
+    ArgAlign = Arg.getParamAlignment();
+  }
+
+  if (ArgAlign == 0)
+    ArgAlign = DL.getABITypeAlignment(Ty);
+
+  return std::make_pair(Ty, ArgAlign);
+}
+
 namespace llvm {
 
 static cl::opt<bool> DumpHSAMetadata(
@@ -343,24 +360,29 @@
   if (Node && ArgNo < Node->getNumOperands())
     TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
 
-  Type *Ty = Arg.getType();
   const DataLayout &DL = Func->getParent()->getDataLayout();
 
   unsigned PointeeAlign = 0;
-  if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
+  if (auto PtrTy = dyn_cast<PointerType>(Arg.getType())) {
     if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+      // FIXME: Should report this for all address spaces
       PointeeAlign = Arg.getParamAlignment();
       if (PointeeAlign == 0)
         PointeeAlign = DL.getABITypeAlignment(PtrTy->getElementType());
     }
   }
 
-  emitKernelArg(DL, Ty, getValueKind(Arg.getType(), TypeQual, BaseTypeName),
-                PointeeAlign, Name, TypeName, BaseTypeName, AccQual, TypeQual);
+  Type *ArgTy;
+  unsigned ArgAlign;
+  std::tie(ArgTy, ArgAlign) = getArgumentTypeAlign(Arg, DL);
+
+  emitKernelArg(DL, ArgTy, ArgAlign,
+                getValueKind(ArgTy, TypeQual, BaseTypeName), PointeeAlign, Name,
+                TypeName, BaseTypeName, AccQual, TypeQual);
 }
 
 void MetadataStreamerV2::emitKernelArg(const DataLayout &DL, Type *Ty,
-                                       ValueKind ValueKind,
+                                       unsigned Align, ValueKind ValueKind,
                                        unsigned PointeeAlign, StringRef Name,
                                        StringRef TypeName,
                                        StringRef BaseTypeName,
@@ -371,7 +393,7 @@
   Arg.mName = std::string(Name);
   Arg.mTypeName = std::string(TypeName);
   Arg.mSize = DL.getTypeAllocSize(Ty);
-  Arg.mAlign = DL.getABITypeAlignment(Ty);
+  Arg.mAlign = Align;
   Arg.mValueKind = ValueKind;
   Arg.mValueType = getValueType(Ty, BaseTypeName);
   Arg.mPointeeAlign = PointeeAlign;
@@ -408,11 +430,11 @@
   auto Int64Ty = Type::getInt64Ty(Func.getContext());
 
   if (HiddenArgNumBytes >= 8)
-    emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetX);
+    emitKernelArg(DL, Int64Ty, 8, ValueKind::HiddenGlobalOffsetX);
   if (HiddenArgNumBytes >= 16)
-    emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY);
+    emitKernelArg(DL, Int64Ty, 8, ValueKind::HiddenGlobalOffsetY);
   if (HiddenArgNumBytes >= 24)
-    emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ);
+    emitKernelArg(DL, Int64Ty, 8, ValueKind::HiddenGlobalOffsetZ);
 
   auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
                                       AMDGPUAS::GLOBAL_ADDRESS);
@@ -421,31 +443,31 @@
   // "none" argument.
   if (HiddenArgNumBytes >= 32) {
     if (Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
-      emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer);
+      emitKernelArg(DL, Int8PtrTy, 8, ValueKind::HiddenPrintfBuffer);
     else if (Func.getParent()->getFunction("__ockl_hostcall_internal")) {
       // The printf runtime binding pass should have ensured that hostcall and
       // printf are not used in the same module.
       assert(!Func.getParent()->getNamedMetadata("llvm.printf.fmts"));
-      emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenHostcallBuffer);
+      emitKernelArg(DL, Int8PtrTy, 8, ValueKind::HiddenHostcallBuffer);
     } else
-      emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
+      emitKernelArg(DL, Int8PtrTy, 8, ValueKind::HiddenNone);
   }
 
   // Emit "default queue" and "completion action" arguments if enqueue kernel is
   // used, otherwise emit dummy "none" arguments.
   if (HiddenArgNumBytes >= 48) {
     if (Func.hasFnAttribute("calls-enqueue-kernel")) {
-      emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenDefaultQueue);
-      emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenCompletionAction);
+      emitKernelArg(DL, Int8PtrTy, 8, ValueKind::HiddenDefaultQueue);
+      emitKernelArg(DL, Int8PtrTy, 8, ValueKind::HiddenCompletionAction);
     } else {
-      emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
-      emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
+      emitKernelArg(DL, Int8PtrTy, 8, ValueKind::HiddenNone);
+      emitKernelArg(DL, Int8PtrTy, 8, ValueKind::HiddenNone);
     }
   }
 
   // Emit the pointer argument for multi-grid object.
   if (HiddenArgNumBytes >= 56)
-    emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenMultiGridSyncArg);
+    emitKernelArg(DL, Int8PtrTy, 8, ValueKind::HiddenMultiGridSyncArg);
 }
 
 bool MetadataStreamerV2::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
@@ -765,11 +787,11 @@
   if (Node && ArgNo < Node->getNumOperands())
     TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
 
-  Type *Ty = Arg.getType();
   const DataLayout &DL = Func->getParent()->getDataLayout();
 
   unsigned PointeeAlign = 0;
-  if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
+  if (auto PtrTy = dyn_cast<PointerType>(Arg.getType())) {
+    // FIXME: Should report this for all address spaces
     if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
       PointeeAlign = Arg.getParamAlignment();
       if (PointeeAlign == 0)
@@ -777,19 +799,21 @@
     }
   }
 
-  emitKernelArg(Func->getParent()->getDataLayout(), Arg.getType(),
-                getValueKind(Arg.getType(), TypeQual, BaseTypeName), Offset,
-                Args, PointeeAlign, Name, TypeName, BaseTypeName, AccQual,
-                TypeQual);
+  // There's no distinction between byval aggregates and raw aggregates.
+  Type *ArgTy;
+  unsigned ArgAlign;
+  std::tie(ArgTy, ArgAlign) = getArgumentTypeAlign(Arg, DL);
+
+  emitKernelArg(DL, ArgTy, ArgAlign,
+                getValueKind(ArgTy, TypeQual, BaseTypeName), Offset, Args,
+                PointeeAlign, Name, TypeName, BaseTypeName, AccQual, TypeQual);
 }
 
-void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty,
-                                       StringRef ValueKind, unsigned &Offset,
-                                       msgpack::ArrayDocNode Args,
-                                       unsigned PointeeAlign, StringRef Name,
-                                       StringRef TypeName,
-                                       StringRef BaseTypeName,
-                                       StringRef AccQual, StringRef TypeQual) {
+void MetadataStreamerV3::emitKernelArg(
+    const DataLayout &DL, Type *Ty, unsigned Align, StringRef ValueKind,
+    unsigned &Offset, msgpack::ArrayDocNode Args, unsigned PointeeAlign,
+    StringRef Name, StringRef TypeName, StringRef BaseTypeName,
+    StringRef AccQual, StringRef TypeQual) {
   auto Arg = Args.getDocument()->getMapNode();
 
   if (!Name.empty())
@@ -797,7 +821,6 @@
   if (!TypeName.empty())
     Arg[".type_name"] = Arg.getDocument()->getNode(TypeName, /*Copy=*/true);
   auto Size = DL.getTypeAllocSize(Ty);
-  auto Align = DL.getABITypeAlignment(Ty);
   Arg[".size"] = Arg.getDocument()->getNode(Size);
   Offset = alignTo(Offset, Align);
   Arg[".offset"] = Arg.getDocument()->getNode(Offset);
@@ -846,11 +869,11 @@
   auto Int64Ty = Type::getInt64Ty(Func.getContext());
 
   if (HiddenArgNumBytes >= 8)
-    emitKernelArg(DL, Int64Ty, "hidden_global_offset_x", Offset, Args);
+    emitKernelArg(DL, Int64Ty, 8, "hidden_global_offset_x", Offset, Args);
   if (HiddenArgNumBytes >= 16)
-    emitKernelArg(DL, Int64Ty, "hidden_global_offset_y", Offset, Args);
+    emitKernelArg(DL, Int64Ty, 8, "hidden_global_offset_y", Offset, Args);
   if (HiddenArgNumBytes >= 24)
-    emitKernelArg(DL, Int64Ty, "hidden_global_offset_z", Offset, Args);
+    emitKernelArg(DL, Int64Ty, 8, "hidden_global_offset_z", Offset, Args);
 
   auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
                                       AMDGPUAS::GLOBAL_ADDRESS);
@@ -859,31 +882,31 @@
   // "none" argument.
   if (HiddenArgNumBytes >= 32) {
     if (Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
-      emitKernelArg(DL, Int8PtrTy, "hidden_printf_buffer", Offset, Args);
+      emitKernelArg(DL, Int8PtrTy, 8, "hidden_printf_buffer", Offset, Args);
    else if (Func.getParent()->getFunction("__ockl_hostcall_internal")) {
       // The printf runtime binding pass should have ensured that hostcall and
       // printf are not used in the same module.
       assert(!Func.getParent()->getNamedMetadata("llvm.printf.fmts"));
-      emitKernelArg(DL, Int8PtrTy, "hidden_hostcall_buffer", Offset, Args);
+      emitKernelArg(DL, Int8PtrTy, 8, "hidden_hostcall_buffer", Offset, Args);
     } else
-      emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args);
+      emitKernelArg(DL, Int8PtrTy, 8, "hidden_none", Offset, Args);
   }
 
   // Emit "default queue" and "completion action" arguments if enqueue kernel is
   // used, otherwise emit dummy "none" arguments.
   if (HiddenArgNumBytes >= 48) {
     if (Func.hasFnAttribute("calls-enqueue-kernel")) {
-      emitKernelArg(DL, Int8PtrTy, "hidden_default_queue", Offset, Args);
-      emitKernelArg(DL, Int8PtrTy, "hidden_completion_action", Offset, Args);
+      emitKernelArg(DL, Int8PtrTy, 8, "hidden_default_queue", Offset, Args);
+      emitKernelArg(DL, Int8PtrTy, 8, "hidden_completion_action", Offset, Args);
     } else {
-      emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args);
-      emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args);
+      emitKernelArg(DL, Int8PtrTy, 8, "hidden_none", Offset, Args);
+      emitKernelArg(DL, Int8PtrTy, 8, "hidden_none", Offset, Args);
     }
   }
 
   // Emit the pointer argument for multi-grid object.
   if (HiddenArgNumBytes >= 56)
-    emitKernelArg(DL, Int8PtrTy, "hidden_multigrid_sync_arg", Offset, Args);
+    emitKernelArg(DL, Int8PtrTy, 8, "hidden_multigrid_sync_arg", Offset, Args);
 }
 
 msgpack::MapDocNode
Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -962,10 +962,14 @@
   unsigned InIndex = 0;
 
   for (const Argument &Arg : Fn.args()) {
+    const bool IsByVal = Arg.hasByValAttr();
     Type *BaseArgTy = Arg.getType();
-    unsigned Align = DL.getABITypeAlignment(BaseArgTy);
+    Type *MemArgTy = IsByVal ? Arg.getParamByValType() : BaseArgTy;
+    unsigned Align = IsByVal ? Arg.getParamAlignment() : 0;
+    if (Align == 0)
+      Align = DL.getABITypeAlignment(MemArgTy);
     MaxAlign = std::max(Align, MaxAlign);
-    unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
+    unsigned AllocSize = DL.getTypeAllocSize(MemArgTy);
 
     uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
     ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
Index: llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -108,8 +108,12 @@
   uint64_t ExplicitArgOffset = 0;
 
   for (Argument &Arg : F.args()) {
-    Type *ArgTy = Arg.getType();
-    unsigned ABITypeAlign = DL.getABITypeAlignment(ArgTy);
+    const bool IsByVal = Arg.hasByValAttr();
+    Type *ArgTy = IsByVal ? Arg.getParamByValType() : Arg.getType();
+    unsigned ABITypeAlign = IsByVal ? Arg.getParamAlignment() : 0;
+    if (ABITypeAlign == 0)
+      ABITypeAlign = DL.getABITypeAlignment(ArgTy);
+
     unsigned Size = DL.getTypeSizeInBits(ArgTy);
     unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
 
@@ -119,6 +123,19 @@
     if (Arg.use_empty())
       continue;
 
+    // If this is byval, the loads are already explicit in the function. We just
+    // need to rewrite the pointer values.
+    if (IsByVal) {
+      Value *ArgOffsetPtr = Builder.CreateConstInBoundsGEP1_64(
+          Builder.getInt8Ty(), KernArgSegment, EltOffset,
+          Arg.getName() + ".byval.kernarg.offset");
+
+      Value *CastOffsetPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+          ArgOffsetPtr, Arg.getType());
+      Arg.replaceAllUsesWith(CastOffsetPtr);
+      continue;
+    }
+
     if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
       // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
       // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2153,7 +2153,7 @@
   // kern arg offset.
   const unsigned KernelArgBaseAlign = 16;
 
-  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
     const ISD::InputArg &Arg = Ins[i];
     if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
       InVals.push_back(DAG.getUNDEF(Arg.VT));
@@ -2170,6 +2170,19 @@
       const uint64_t Offset = VA.getLocMemOffset();
       unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
 
+      if (Arg.Flags.isByVal()) {
+        SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
+
+        if (!isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
+                                 Arg.Flags.getPointerAddrSpace())) {
+          Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
+                                     Arg.Flags.getPointerAddrSpace());
+        }
+
+        InVals.push_back(Ptr);
+        continue;
+      }
+
       SDValue Arg = lowerKernargMemParameter(
         DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
       Chains.push_back(Arg.getValue(1));
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll
@@ -720,3 +720,318 @@
   ; HSA-VI:   S_ENDPGM 0
   ret void
 }
+
+; Byval pointers should only be treated as offsets from kernarg
+define amdgpu_kernel void @byval_constant_i8_arg(i32 addrspace(1)* nocapture %out, i8 addrspace(4)* byval %in.byval) {
+  ; HSA-VI-LABEL: name: byval_constant_i8_arg
+  ; HSA-VI: bb.1 (%ir-block.0):
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:   [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64)
+  ; HSA-VI:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4)
+  ; HSA-VI:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; HSA-VI:   [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64)
+  ; HSA-VI:   [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable load 1 from %ir.in.byval, addrspace 4)
+  ; HSA-VI:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
+  ; HSA-VI:   G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
+  ; HSA-VI:   S_ENDPGM 0
+  %in = load i8, i8 addrspace(4)* %in.byval
+  %ext = zext i8 %in to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @byval_constant_i16_arg(i32 addrspace(1)* nocapture %out, i16 addrspace(4)* byval %in.byval) {
+  ; HSA-VI-LABEL: name: byval_constant_i16_arg
+  ; HSA-VI: bb.1 (%ir-block.0):
+  ; HSA-VI:   liveins: $sgpr4_sgpr5
+  ; HSA-VI:   [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
+  ; HSA-VI:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; HSA-VI:
[[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable load 2 from %ir.in.byval, addrspace 4) + ; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16) + ; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + %in = load i16, i16 addrspace(4)* %in.byval + %ext = zext i16 %in to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byval_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byval %in.byval, i32 %after.offset) { + ; HSA-VI-LABEL: name: byval_constant_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; HSA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load 4, addrspace 4) + ; HSA-VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable load 4 from %ir.in.byval, addrspace 4) + ; HSA-VI: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + %in = load i32, i32 addrspace(4)* %in.byval + store volatile i32 %in, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byval_constant_v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> addrspace(4)* byval %in.byval, i32 %after.offset) { + ; HSA-VI-LABEL: name: byval_constant_v4i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 + ; HSA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load 4, align 16, addrspace 4) + ; HSA-VI: [[LOAD2:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable load 16 from %ir.in.byval, addrspace 4) + ; HSA-VI: G_STORE [[LOAD2]](<4 x s32>), [[LOAD]](p1) :: (volatile store 16 into %ir.out, align 4, addrspace 1) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out.cast, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + %in = load <4 x i32>, <4 x i32> addrspace(4)* 
%in.byval + store volatile <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 + %out.cast = bitcast <4 x i32> addrspace(1)* %out to i32 addrspace(1)* + store volatile i32 %after.offset, i32 addrspace(1)* %out.cast, align 4 + ret void +} + +define amdgpu_kernel void @byval_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byval align 256 %in.byval, i32 %after.offset) { + ; HSA-VI-LABEL: name: byval_align_constant_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 256 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 260 + ; HSA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load 4, addrspace 4) + ; HSA-VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable load 4 from %ir.in.byval, addrspace 4) + ; HSA-VI: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + %in = load i32, i32 addrspace(4)* %in.byval + store volatile i32 %in, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byval_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byval %in.byval, i32 %after.offset) { + ; HSA-VI-LABEL: name: byval_natural_align_constant_v16i32_arg + ; HSA-VI: bb.1 (%ir-block.1): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load 1, align 8, addrspace 4) + ; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 + ; HSA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; HSA-VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 128 + ; HSA-VI: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; HSA-VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load 4, align 16, addrspace 4) + ; HSA-VI: [[LOAD3:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable load 64 from %ir.in.byval, addrspace 4) + ; HSA-VI: G_STORE [[LOAD3]](<16 x s32>), [[LOAD]](p1) :: (volatile store 64 into %ir.cast.out, align 4, addrspace 1) + ; HSA-VI: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + %in = load <16 x i32>, <16 x i32> addrspace(4)* %in.byval + %cast.out = bitcast i32 addrspace(1)* %out to <16 x i32> addrspace(1)* + store volatile <16 x i32> %in, <16 x i32> addrspace(1)* %cast.out, align 4 + store volatile i32 %after.offset, 
i32 addrspace(1)* %out, align 4 + ret void +} + +; Also accept byval kernel arguments with other global address spaces. +define amdgpu_kernel void @byval_global_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* byval %in.byval) { + ; HSA-VI-LABEL: name: byval_global_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p1) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; HSA-VI: [[COPY1:%[0-9]+]]:_(p1) = COPY [[ADDRSPACE_CAST]](p1) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[COPY1]](p1) :: (dereferenceable load 4 from %ir.1, addrspace 1) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + %in = load i32, i32 addrspace(1)* %in.byval + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byval_flat_i32_arg(i32 addrspace(1)* nocapture %out, i32* byval %in.byval) { + ; HSA-VI-LABEL: name: byval_flat_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p0) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p0) :: (dereferenceable load 4 from %ir.in.byval) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + %in = load i32, i32* %in.byval + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byval_constant_32bit_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(6)* byval %in.byval) { + ; HSA-VI-LABEL: name: byval_constant_32bit_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p6) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p6) :: (dereferenceable load 4 from %ir.in.byval, addrspace 6) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + %in = load i32, i32 addrspace(6)* %in.byval + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byval_unknown_as_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(999)* byval %in.byval) { + ; HSA-VI-LABEL: name: 
byval_unknown_as_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p999) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p999) :: (dereferenceable load 4 from %ir.in.byval, addrspace 999) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + %in = load i32, i32 addrspace(999)* %in.byval + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +; Invalid, but should not crash. +define amdgpu_kernel void @byval_private_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(5)* byval %in.byval) { + ; HSA-VI-LABEL: name: byval_private_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p5) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p5) :: (dereferenceable load 4 from %ir.in.byval, addrspace 5) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + %in = load i32, i32 addrspace(5)* %in.byval + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +; Invalid, but should not crash. 
+define amdgpu_kernel void @byval_local_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(3)* byval %in.byval) { + ; HSA-VI-LABEL: name: byval_local_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p3) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p3) :: (dereferenceable load 4 from %ir.in.byval, addrspace 3) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + %in = load i32, i32 addrspace(3)* %in.byval + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @multi_byval_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byval %in0.byval, i32 addrspace(4)* byval %in1.byval, i32 %after.offset) { + ; HSA-VI-LABEL: name: multi_byval_constant_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; HSA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; HSA-VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; HSA-VI: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load 4, align 16, addrspace 4) + ; HSA-VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable load 4 from %ir.in0.byval, addrspace 4) + ; HSA-VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable load 4 from %ir.in1.byval, addrspace 4) + ; HSA-VI: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; HSA-VI: G_STORE [[LOAD3]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + %in0 = load i32, i32 addrspace(4)* %in0.byval + %in1 = load i32, i32 addrspace(4)* %in1.byval + store volatile i32 %in0, i32 addrspace(1)* %out, align 4 + store volatile i32 %in1, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byval_constant_i32_arg_offset0(i32 addrspace(4)* byval %in.byval) { + ; HSA-VI-LABEL: name: byval_constant_i32_arg_offset0 + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; HSA-VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD]](p4) :: (dereferenceable load 4 from %ir.in.byval, addrspace 4) + ; HSA-VI: G_STORE [[LOAD]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + %in = load i32, i32 addrspace(4)* %in.byval + store i32 %in, i32 addrspace(1)* undef, align 4 + ret void +} + +define amdgpu_kernel void @byval_constant_0size_arg({} addrspace(4)* byval %in.byval, {} addrspace(4)* addrspace(1)* %out.ptr) { + ; HSA-VI-LABEL: name: byval_constant_0size_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: G_STORE [[LOAD]](p4), %1:_(p1) :: (store 8 into %ir.out.ptr, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + store {} addrspace(4)* %in.byval, {} addrspace(4)* addrspace(1)* %out.ptr + ret void +} Index: llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll +++ llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll @@ -59,6 +59,93 @@ ret void } +; CHECK: - .args: +; CHECK-NEXT: - .name: a +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 1 +; CHECK-NEXT: .type_name: char +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: - .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NOT: .value_kind: hidden_default_queue +; CHECK-NOT: .value_kind: hidden_completion_action +; CHECK-NOT: .value_kind: hidden_hostcall_buffer +; CHECK-NEXT: .value_kind: hidden_printf_buffer +; CHECK-NEXT: .value_type: i8 +; CHECK: .value_kind: hidden_multigrid_sync_arg +; CHECK-NEXT: .value_type: i8 +; CHECK: .language: OpenCL C +; CHECK-NEXT: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .name: test_char_byval_constant +; CHECK: .symbol: test_char_byval_constant.kd +define amdgpu_kernel void @test_char_byval_constant(i8 addrspace(4)* byval %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9 + !kernel_arg_base_type !9 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - .args: +; CHECK-NEXT: - .offset: 0 +; CHECK-NEXT: .size: 1 +; CHECK-NEXT: .type_name: char +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: - .name: a +; CHECK-NEXT: .offset: 512 +; CHECK-NEXT: .size: 1 +; CHECK-NEXT: .type_name: char +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: - .offset: 520 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .offset: 528 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .offset: 536 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - 
.address_space: global +; CHECK-NEXT: .offset: 544 +; CHECK-NEXT: .size: 8 +; CHECK-NOT: .value_kind: hidden_default_queue +; CHECK-NOT: .value_kind: hidden_completion_action +; CHECK-NOT: .value_kind: hidden_hostcall_buffer +; CHECK-NEXT: .value_kind: hidden_printf_buffer +; CHECK-NEXT: .value_type: i8 +; CHECK: .value_kind: hidden_multigrid_sync_arg +; CHECK-NEXT: .value_type: i8 +; CHECK: .language: OpenCL C +; CHECK-NEXT: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .name: test_char_byval_constant_align512 +; CHECK: .symbol: test_char_byval_constant_align512.kd +define amdgpu_kernel void @test_char_byval_constant_align512(i8, i8 addrspace(4)* byval align 512 %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !111 + !kernel_arg_base_type !9 !kernel_arg_type_qual !4 { + ret void +} + ; CHECK: - .args: ; CHECK-NEXT: - .name: a ; CHECK-NEXT: .offset: 0 @@ -623,6 +710,57 @@ ret void } +; CHECK: - .args: +; CHECK-NEXT: .name: a +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .type_name: struct A +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .value_type: struct +; CHECK-NEXT: - .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_printf_buffer +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 40 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_none +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 48 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_none +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 56 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg +; CHECK-NEXT: .value_type: i8 +; CHECK: .language: OpenCL C +; CHECK-NEXT: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .name: test_struct_byval_constant +; CHECK: .symbol: test_struct_byval_constant.kd +define amdgpu_kernel void @test_struct_byval_constant(%struct.A addrspace(4)* byval %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20 + !kernel_arg_base_type !20 !kernel_arg_type_qual !4 { + ret void +} + ; CHECK: - .args: ; CHECK-NEXT: .name: a ; CHECK-NEXT: .offset: 0 @@ -674,6 +812,57 @@ ret void } +; CHECK: - .args: +; CHECK-NEXT: .name: a +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 32 +; CHECK-NEXT: .type_name: struct A +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .value_type: struct +; CHECK-NEXT: - .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .offset: 40 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .offset: 48 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 56 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_printf_buffer +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: 
- .address_space: global +; CHECK-NEXT: .offset: 64 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_none +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 72 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_none +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 80 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg +; CHECK-NEXT: .value_type: i8 +; CHECK: .language: OpenCL C +; CHECK-NEXT: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .name: test_array_byval_constant +; CHECK: .symbol: test_array_byval_constant.kd +define amdgpu_kernel void @test_array_byval_constant([32 x i8] addrspace(4)* byval %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20 + !kernel_arg_base_type !20 !kernel_arg_type_qual !4 { + ret void +} + ; CHECK: - .args: ; CHECK-NEXT: - .name: a ; CHECK-NEXT: .offset: 0 @@ -2093,5 +2282,6 @@ !100 = !{!"1:1:4:%d\5Cn"} !101 = !{!"2:1:8:%g\5Cn"} !110 = !{!"__block_literal"} +!111 = !{!"char", !"char"} ; PARSER: AMDGPU HSA Metadata Parser Test: PASS Index: llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll +++ llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll @@ -64,6 +64,93 @@ ret void } +; CHECK: - Name: test_char_byval_constant +; CHECK-NEXT: SymbolName: 'test_char_byval_constant@kd' +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Name: a +; CHECK-NEXT: TypeName: char +; CHECK-NEXT: Size: 1 +; CHECK-NEXT: Align: 1 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NOT: ValueKind: HiddenHostcallBuffer +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NOT: ValueKind: HiddenDefaultQueue +; CHECK-NOT: ValueKind: HiddenCompletionAction +; CHECK: ValueKind: HiddenMultiGridSyncArg +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_char_byval_constant(i8 addrspace(4)* byval %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9 + !kernel_arg_base_type !9 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - Name: test_char_byval_constant_align512 +; CHECK-NEXT: SymbolName: 'test_char_byval_constant_align512@kd' +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: TypeName: char +; CHECK-NEXT: Size: 1 +; CHECK-NEXT: Align: 1 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: - Name: a +; CHECK-NEXT: TypeName: char +; CHECK-NEXT: Size: 1 +; CHECK-NEXT: Align: 512 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - 
Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NOT: ValueKind: HiddenHostcallBuffer +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NOT: ValueKind: HiddenDefaultQueue +; CHECK-NOT: ValueKind: HiddenCompletionAction +; CHECK: ValueKind: HiddenMultiGridSyncArg +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_char_byval_constant_align512(i8, i8 addrspace(4)* byval align 512 %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !111 + !kernel_arg_base_type !9 !kernel_arg_type_qual !4 { + ret void +} + ; CHECK: - Name: test_ushort2 ; CHECK-NEXT: SymbolName: 'test_ushort2@kd' ; CHECK-NEXT: Language: OpenCL C @@ -617,6 +704,56 @@ ret void } +; CHECK: - Name: test_struct_byval_constant +; CHECK-NEXT: SymbolName: 'test_struct_byval_constant@kd' +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Name: a +; CHECK-NEXT: TypeName: struct A +; CHECK-NEXT: Size: 8 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: Struct +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenNone +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenNone +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenMultiGridSyncArg +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_struct_byval_constant(%struct.A addrspace(4)* byval %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20 + !kernel_arg_base_type !20 !kernel_arg_type_qual !4 { + ret void +} + ; CHECK: - Name: test_array ; CHECK-NEXT: SymbolName: 'test_array@kd' ; CHECK-NEXT: Language: OpenCL C @@ -667,6 +804,56 @@ ret void } +; CHECK: - Name: test_array_byval_constant +; CHECK-NEXT: SymbolName: 'test_array_byval_constant@kd' +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Name: a +; CHECK-NEXT: TypeName: struct A +; CHECK-NEXT: Size: 8 +; CHECK-NEXT: Align: 1 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: Struct +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: 
HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenNone +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenNone +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenMultiGridSyncArg +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_array_byval_constant([8 x i8] addrspace(4)* byval %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20 + !kernel_arg_base_type !20 !kernel_arg_type_qual !4 { + ret void +} + ; CHECK: - Name: test_i128 ; CHECK-NEXT: SymbolName: 'test_i128@kd' ; CHECK-NEXT: Language: OpenCL C @@ -2064,5 +2251,5 @@ !100 = !{!"1:1:4:%d\5Cn"} !101 = !{!"2:1:8:%g\5Cn"} !110 = !{!"__block_literal"} - +!111 = !{!"char", !"char"} ; PARSER: AMDGPU HSA Metadata Parser Test: PASS Index: llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll +++ llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll @@ -152,3 +152,135 @@ store <3 x i15> %in, <3 x i15> addrspace(1)* %out, align 4 ret void } + +; Byval pointers should only be treated as offsets from kernarg +; GCN-LABEL: {{^}}byval_constant_i8_arg: +; GCN: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s4 +; GCN: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], s5 +; GCN: global_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]{{\]}}, off offset:8 +define amdgpu_kernel void @byval_constant_i8_arg(i32 addrspace(1)* nocapture %out, i8 addrspace(4)* byval %in.byval) { + %in = load i8, i8 addrspace(4)* %in.byval + %ext = zext i8 %in to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}byval_constant_i16_arg: +; GCN: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s4 +; GCN: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], s5 +; GCN: global_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]{{\]}}, off offset:8 +define amdgpu_kernel void @byval_constant_i16_arg(i32 addrspace(1)* nocapture %out, i16 addrspace(4)* byval %in.byval) { + %in = load i16, i16 addrspace(4)* %in.byval + %ext = zext i16 %in to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}byval_constant_i32_arg: +; GCN: s_load_dword [[IN:s[0-9]+]], s[4:5], 0x8{{$}} +; GCN: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0xc{{$}} +define amdgpu_kernel void @byval_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byval %in.byval, i32 %after.offset) { + %in = load i32, i32 addrspace(4)* %in.byval + store volatile i32 %in, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}byval_constant_v4i32_arg: +; GCN: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10{{$}} +; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x20{{$}} +define amdgpu_kernel void @byval_constant_v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> addrspace(4)* byval %in.byval, i32 %after.offset) { + %in = load <4 x i32>, <4 x i32> addrspace(4)* %in.byval + store volatile <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 + %out.cast = bitcast <4 x i32> 
addrspace(1)* %out to i32 addrspace(1)* + store volatile i32 %after.offset, i32 addrspace(1)* %out.cast, align 4 + ret void +} + +; GCN-LABEL: {{^}}byval_align_constant_i32_arg: +; GCN-DAG: s_load_dword [[IN:s[0-9]+]], s[4:5], 0x100{{$}} +; GCN-DAG: s_load_dword [[AFTER_OFFSET:s[0-9]+]], s[4:5], 0x104{{$}} +; GCN-DAG: v_mov_b32_e32 [[V_IN:v[0-9]+]], [[IN]] +; GCN-DAG: v_mov_b32_e32 [[V_AFTER_OFFSET:v[0-9]+]], [[AFTER_OFFSET]] +; GCN: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[V_IN]] +; GCN: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[V_AFTER_OFFSET]] +define amdgpu_kernel void @byval_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byval align 256 %in.byval, i32 %after.offset) { + %in = load i32, i32 addrspace(4)* %in.byval + store volatile i32 %in, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}byval_natural_align_constant_v16i32_arg: +; GCN-DAG: s_load_dword s{{[0-9]+}}, s[4:5], 0x80 +; GCN-DAG: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x40{{$}} +define amdgpu_kernel void @byval_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byval %in.byval, i32 %after.offset) { + %in = load <16 x i32>, <16 x i32> addrspace(4)* %in.byval + %cast.out = bitcast i32 addrspace(1)* %out to <16 x i32> addrspace(1)* + store volatile <16 x i32> %in, <16 x i32> addrspace(1)* %cast.out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +; Also accept byval kernel arguments with other global address spaces. +; GCN-LABEL: {{^}}byval_global_i32_arg: +; GCN: s_load_dword [[IN:s[0-9]+]], s[4:5], 0x8{{$}} +define amdgpu_kernel void @byval_global_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* byval %in.byval) { + %in = load i32, i32 addrspace(1)* %in.byval + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}byval_flat_i32_arg: +; GCN: flat_load_dword [[IN:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}} offset:8{{$}} +define amdgpu_kernel void @byval_flat_i32_arg(i32 addrspace(1)* nocapture %out, i32* byval %in.byval) { + %in = load i32, i32* %in.byval + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}byval_constant_32bit_i32_arg: +; GCN: s_add_i32 s[[PTR_LO:[0-9]+]], s4, 8 +; GCN: s_mov_b32 s[[PTR_HI:[0-9]+]], 0{{$}} +; GCN: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x0{{$}} +define amdgpu_kernel void @byval_constant_32bit_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(6)* byval %in.byval) { + %in = load i32, i32 addrspace(6)* %in.byval + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +; define amdgpu_kernel void @byval_unknown_as_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(999)* byval %in.byval) { +; %in = load i32, i32 addrspace(999)* %in.byval +; store i32 %in, i32 addrspace(1)* %out, align 4 +; ret void +; } + +; GCN-LABEL: {{^}}multi_byval_constant_i32_arg: +; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0x8 +; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0xc +; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0x10 +define amdgpu_kernel void @multi_byval_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byval %in0.byval, i32 addrspace(4)* byval %in1.byval, i32 %after.offset) { + %in0 = load i32, i32 addrspace(4)* %in0.byval + %in1 = load i32, i32 addrspace(4)* %in1.byval + store volatile i32 %in0, i32 addrspace(1)* %out, align 4 + store volatile i32 %in1, i32 addrspace(1)* %out, align 4 
+ store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}byval_constant_i32_arg_offset0: +; GCN-NOT: s4 +; GCN-NOT: s5 +; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0x0{{$}} +define amdgpu_kernel void @byval_constant_i32_arg_offset0(i32 addrspace(4)* byval %in.byval) { + %in = load i32, i32 addrspace(4)* %in.byval + store i32 %in, i32 addrspace(1)* undef, align 4 + ret void +} + +; GCN-LABEL: {{^}}byval_constant_0size_arg: +; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0{{$}} +define amdgpu_kernel void @byval_constant_0size_arg({} addrspace(4)* byval %in.byval, {} addrspace(4)* addrspace(1)* %out.ptr) { + store {} addrspace(4)* %in.byval, {} addrspace(4)* addrspace(1)* %out.ptr + ret void +} Index: llvm/test/CodeGen/AMDGPU/lower-kernargs.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/lower-kernargs.ll +++ llvm/test/CodeGen/AMDGPU/lower-kernargs.ll @@ -1486,6 +1486,486 @@ ret void } +; Byval pointers should only be treated as offsets from kernarg +define amdgpu_kernel void @byval_constant_i8_arg(i32 addrspace(1)* nocapture %out, i8 addrspace(4)* byval %in.byval) { +; HSA-LABEL: @byval_constant_i8_arg( +; HSA-NEXT: [[BYVAL_CONSTANT_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[IN:%.*]] = load i8, i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] +; HSA-NEXT: [[EXT:%.*]] = zext i8 [[IN]] to i32 +; HSA-NEXT: store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byval_constant_i8_arg( +; MESA-NEXT: [[BYVAL_CONSTANT_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[IN:%.*]] = load i8, i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] +; MESA-NEXT: [[EXT:%.*]] = zext i8 [[IN]] to i32 +; MESA-NEXT: store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i8, i8 addrspace(4)* %in.byval + %ext = zext i8 %in to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byval_constant_i16_arg(i32 addrspace(1)* nocapture %out, i16 addrspace(4)* byval %in.byval) { +; HSA-LABEL: @byval_constant_i16_arg( +; HSA-NEXT: [[BYVAL_CONSTANT_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 
dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to i16 addrspace(4)* +; HSA-NEXT: [[IN:%.*]] = load i16, i16 addrspace(4)* [[TMP1]] +; HSA-NEXT: [[EXT:%.*]] = zext i16 [[IN]] to i32 +; HSA-NEXT: store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byval_constant_i16_arg( +; MESA-NEXT: [[BYVAL_CONSTANT_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to i16 addrspace(4)* +; MESA-NEXT: [[IN:%.*]] = load i16, i16 addrspace(4)* [[TMP1]] +; MESA-NEXT: [[EXT:%.*]] = zext i16 [[IN]] to i32 +; MESA-NEXT: store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i16, i16 addrspace(4)* %in.byval + %ext = zext i16 %in to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byval_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byval %in.byval, i32 %after.offset) { +; HSA-LABEL: @byval_constant_i32_arg( +; HSA-NEXT: [[BYVAL_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 12 +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* 
[[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]] +; HSA-NEXT: store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byval_constant_i32_arg( +; MESA-NEXT: [[BYVAL_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(56) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 48 +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; MESA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]] +; MESA-NEXT: store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32 addrspace(4)* %in.byval + store volatile i32 %in, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byval_constant_v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> addrspace(4)* byval %in.byval, i32 %after.offset) { +; HSA-LABEL: @byval_constant_v4i32_arg( +; HSA-NEXT: [[BYVAL_CONSTANT_V4I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to <4 x i32> addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load <4 x i32> addrspace(1)*, <4 x i32> addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 16 +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to <4 x i32> addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 32 +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: 
[[IN:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP1]] +; HSA-NEXT: store volatile <4 x i32> [[IN]], <4 x i32> addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: [[OUT_CAST:%.*]] = bitcast <4 x i32> addrspace(1)* [[OUT_LOAD]] to i32 addrspace(1)* +; HSA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_CAST]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byval_constant_v4i32_arg( +; MESA-NEXT: [[BYVAL_CONSTANT_V4I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(56) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to <4 x i32> addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load <4 x i32> addrspace(1)*, <4 x i32> addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 52 +; MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to <4 x i32> addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 68 +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP1]] +; MESA-NEXT: store volatile <4 x i32> [[IN]], <4 x i32> addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: [[OUT_CAST:%.*]] = bitcast <4 x i32> addrspace(1)* [[OUT_LOAD]] to i32 addrspace(1)* +; MESA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_CAST]], align 4 +; MESA-NEXT: ret void +; + %in = load <4 x i32>, <4 x i32> addrspace(4)* %in.byval + store volatile <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 + %out.cast = bitcast <4 x i32> addrspace(1)* %out to i32 addrspace(1)* + store volatile i32 %after.offset, i32 addrspace(1)* %out.cast, align 4 + ret void +} + +define amdgpu_kernel void @byval_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byval align 256 %in.byval, i32 %after.offset) { +; HSA-LABEL: @byval_align_constant_i32_arg( +; HSA-NEXT: [[BYVAL_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 256 +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* 
[[BYVAL_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 260 +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]] +; HSA-NEXT: store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byval_align_constant_i32_arg( +; MESA-NEXT: [[BYVAL_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(56) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 292 +; MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 296 +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 8, !invariant.load !0 +; MESA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]] +; MESA-NEXT: store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32 addrspace(4)* %in.byval + store volatile i32 %in, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byval_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byval %in.byval, i32 %after.offset) { +; HSA-LABEL: @byval_natural_align_constant_v16i32_arg( +; HSA-NEXT: [[BYVAL_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(28) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 64 +; HSA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to <16 x i32> addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 
addrspace(4)* [[BYVAL_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 128 +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN:%.*]] = load <16 x i32>, <16 x i32> addrspace(4)* [[TMP2]] +; HSA-NEXT: [[CAST_OUT:%.*]] = bitcast i32 addrspace(1)* [[OUT_LOAD]] to <16 x i32> addrspace(1)* +; HSA-NEXT: store volatile <16 x i32> [[IN]], <16 x i32> addrspace(1)* [[CAST_OUT]], align 4 +; HSA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byval_natural_align_constant_v16i32_arg( +; MESA-NEXT: [[BYVAL_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 100 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to <16 x i32> addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 164 +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN:%.*]] = load <16 x i32>, <16 x i32> addrspace(4)* [[TMP2]] +; MESA-NEXT: [[CAST_OUT:%.*]] = bitcast i32 addrspace(1)* [[OUT_LOAD]] to <16 x i32> addrspace(1)* +; MESA-NEXT: store volatile <16 x i32> [[IN]], <16 x i32> addrspace(1)* [[CAST_OUT]], align 4 +; MESA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load <16 x i32>, <16 x i32> addrspace(4)* %in.byval + %cast.out = bitcast i32 addrspace(1)* %out to <16 x i32> addrspace(1)* + store volatile <16 x i32> %in, <16 x i32> addrspace(1)* %cast.out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +; Also accept byval kernel arguments with other global address spaces. 
+define amdgpu_kernel void @byval_global_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* byval %in.byval) { +; HSA-LABEL: @byval_global_i32_arg( +; HSA-NEXT: [[BYVAL_GLOBAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to i32 addrspace(1)* +; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(1)* [[TMP1]] +; HSA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byval_global_i32_arg( +; MESA-NEXT: [[BYVAL_GLOBAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to i32 addrspace(1)* +; MESA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(1)* [[TMP1]] +; MESA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32 addrspace(1)* %in.byval + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byval_flat_i32_arg(i32 addrspace(1)* nocapture %out, i32* byval %in.byval) { +; HSA-LABEL: @byval_flat_i32_arg( +; HSA-NEXT: [[BYVAL_FLAT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to i32* +; HSA-NEXT: [[IN:%.*]] = load i32, i32* [[TMP1]] +; HSA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byval_flat_i32_arg( +; MESA-NEXT: [[BYVAL_FLAT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* 
@llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to i32* +; MESA-NEXT: [[IN:%.*]] = load i32, i32* [[TMP1]] +; MESA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32* %in.byval + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byval_constant_32bit_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(6)* byval %in.byval) { +; HSA-LABEL: @byval_constant_32bit_i32_arg( +; HSA-NEXT: [[BYVAL_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to i32 addrspace(6)* +; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(6)* [[TMP1]] +; HSA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byval_constant_32bit_i32_arg( +; MESA-NEXT: [[BYVAL_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to i32 addrspace(6)* +; MESA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(6)* [[TMP1]] +; MESA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32 addrspace(6)* %in.byval + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byval_unknown_as_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(999)* byval %in.byval) { +; HSA-LABEL: @byval_unknown_as_i32_arg( +; HSA-NEXT: [[BYVAL_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 
dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to i32 addrspace(999)* +; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(999)* [[TMP1]] +; HSA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byval_unknown_as_i32_arg( +; MESA-NEXT: [[BYVAL_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to i32 addrspace(999)* +; MESA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(999)* [[TMP1]] +; MESA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32 addrspace(999)* %in.byval + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +; Invalid, but should not crash. 
+define amdgpu_kernel void @byval_private_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(5)* byval %in.byval) { +; HSA-LABEL: @byval_private_i32_arg( +; HSA-NEXT: [[BYVAL_PRIVATE_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_PRIVATE_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_PRIVATE_I32_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to i32 addrspace(5)* +; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(5)* [[TMP1]] +; HSA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byval_private_i32_arg( +; MESA-NEXT: [[BYVAL_PRIVATE_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_PRIVATE_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_PRIVATE_I32_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to i32 addrspace(5)* +; MESA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(5)* [[TMP1]] +; MESA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32 addrspace(5)* %in.byval + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +; Invalid, but should not crash. 
+define amdgpu_kernel void @byval_local_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(3)* byval %in.byval) { +; HSA-LABEL: @byval_local_i32_arg( +; HSA-NEXT: [[BYVAL_LOCAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to i32 addrspace(3)* +; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(3)* [[TMP1]] +; HSA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byval_local_i32_arg( +; MESA-NEXT: [[BYVAL_LOCAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to i32 addrspace(3)* +; MESA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(3)* [[TMP1]] +; MESA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32 addrspace(3)* %in.byval + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @multi_byval_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byval %in0.byval, i32 addrspace(4)* byval %in1.byval, i32 %after.offset) { +; HSA-LABEL: @multi_byval_constant_i32_arg( +; HSA-NEXT: [[MULTI_BYVAL_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(28) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYVAL_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN0_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYVAL_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN0_BYVAL_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[IN1_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYVAL_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 12 +; HSA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* 
[[IN1_BYVAL_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYVAL_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 16 +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN0:%.*]] = load i32, i32 addrspace(4)* [[TMP1]] +; HSA-NEXT: [[IN1:%.*]] = load i32, i32 addrspace(4)* [[TMP2]] +; HSA-NEXT: store volatile i32 [[IN0]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: store volatile i32 [[IN1]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @multi_byval_constant_i32_arg( +; MESA-NEXT: [[MULTI_BYVAL_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYVAL_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN0_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYVAL_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN0_BYVAL_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[IN1_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYVAL_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 48 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[IN1_BYVAL_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYVAL_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 52 +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN0:%.*]] = load i32, i32 addrspace(4)* [[TMP1]] +; MESA-NEXT: [[IN1:%.*]] = load i32, i32 addrspace(4)* [[TMP2]] +; MESA-NEXT: store volatile i32 [[IN0]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: store volatile i32 [[IN1]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in0 = load i32, i32 addrspace(4)* %in0.byval + %in1 = load i32, i32 addrspace(4)* %in1.byval + store volatile i32 %in0, i32 addrspace(1)* %out, align 4 + store volatile i32 %in1, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byval_constant_i32_arg_offset0(i32 addrspace(4)* byval %in.byval) { +; HSA-LABEL: @byval_constant_i32_arg_offset0( +; HSA-NEXT: [[BYVAL_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = 
getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]] +; HSA-NEXT: store i32 [[IN]], i32 addrspace(1)* undef, align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byval_constant_i32_arg_offset0( +; MESA-NEXT: [[BYVAL_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]] +; MESA-NEXT: store i32 [[IN]], i32 addrspace(1)* undef, align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32 addrspace(4)* %in.byval + store i32 %in, i32 addrspace(1)* undef, align 4 + ret void +} + +define amdgpu_kernel void @byval_constant_0size_arg({} addrspace(4)* byval %in.byval, {} addrspace(4)* addrspace(1)* %out.ptr) { +; HSA-LABEL: @byval_constant_0size_arg( +; HSA-NEXT: [[BYVAL_CONSTANT_0SIZE_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_0SIZE_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to {} addrspace(4)* +; HSA-NEXT: [[OUT_PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_0SIZE_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_PTR_KERNARG_OFFSET]] to {} addrspace(4)* addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_PTR_LOAD:%.*]] = load {} addrspace(4)* addrspace(1)*, {} addrspace(4)* addrspace(1)* addrspace(4)* [[OUT_PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: store {} addrspace(4)* [[TMP1]], {} addrspace(4)* addrspace(1)* [[OUT_PTR_LOAD]] +; HSA-NEXT: ret void +; +; MESA-LABEL: @byval_constant_0size_arg( +; MESA-NEXT: [[BYVAL_CONSTANT_0SIZE_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[IN_BYVAL_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_0SIZE_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYVAL_BYVAL_KERNARG_OFFSET]] to {} addrspace(4)* +; MESA-NEXT: [[OUT_PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYVAL_CONSTANT_0SIZE_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_PTR_KERNARG_OFFSET]] to {} addrspace(4)* addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_PTR_LOAD:%.*]] = load {} addrspace(4)* addrspace(1)*, {} addrspace(4)* addrspace(1)* addrspace(4)* [[OUT_PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: store {} addrspace(4)* [[TMP1]], {} addrspace(4)* addrspace(1)* [[OUT_PTR_LOAD]] +; MESA-NEXT: ret void +; + store {} addrspace(4)* %in.byval, {} addrspace(4)* addrspace(1)* %out.ptr + ret void +} + attributes #0 = { nounwind "target-cpu"="kaveri" } attributes #1 = { nounwind "target-cpu"="kaveri" 
"amdgpu-implicitarg-num-bytes"="40" } attributes #2 = { nounwind "target-cpu"="tahiti" }