Index: llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -23,8 +23,8 @@
 class MachineInstrBuilder;
 
 class AMDGPUCallLowering: public CallLowering {
-  Register lowerParameterPtr(MachineIRBuilder &B, Type *ParamTy,
-                             uint64_t Offset) const;
+  void lowerParameterPtr(Register DstReg, MachineIRBuilder &B, Type *ParamTy,
+                         uint64_t Offset) const;
 
   void lowerParameter(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset,
                       Align Alignment, Register DstReg) const;
Index: llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -385,24 +385,19 @@
   return true;
 }
 
-Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &B,
-                                               Type *ParamTy,
-                                               uint64_t Offset) const {
-
+void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
+                                           Type *ParamTy,
+                                           uint64_t Offset) const {
   MachineFunction &MF = B.getMF();
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  const Function &F = MF.getFunction();
-  const DataLayout &DL = F.getParent()->getDataLayout();
-  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
-  LLT PtrType = getLLTForType(*PtrTy, DL);
   Register KernArgSegmentPtr =
     MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
   Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
 
   auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);
 
-  return B.buildPtrAdd(PtrType, KernArgSegmentVReg, OffsetReg).getReg(0);
+  B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
 }
 
 void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
@@ -413,7 +408,10 @@
   const DataLayout &DL = F.getParent()->getDataLayout();
   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
   unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
-  Register PtrReg = lowerParameterPtr(B, ParamTy, Offset);
+
+  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+  Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
+  lowerParameterPtr(PtrReg, B, ParamTy, Offset);
 
   MachineMemOperand *MMO = MF.getMachineMemOperand(
       PtrInfo,
@@ -500,12 +498,15 @@
 
   // TODO: Align down to dword alignment and extract bits for extending loads.
   for (auto &Arg : F.args()) {
-    Type *ArgTy = Arg.getType();
+    const bool IsByRef = Arg.hasByRefAttr();
+    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
     unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
     if (AllocSize == 0)
       continue;
 
-    Align ABIAlign = DL.getABITypeAlign(ArgTy);
+    MaybeAlign ABIAlign = IsByRef ? Arg.getParamAlign() : None;
+    if (!ABIAlign)
+      ABIAlign = DL.getABITypeAlign(ArgTy);
 
     uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
     ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
@@ -515,16 +516,34 @@
       continue;
     }
 
-    ArrayRef<Register> OrigArgRegs = VRegs[i];
-    Register ArgReg =
-      OrigArgRegs.size() == 1
-      ? OrigArgRegs[0]
-      : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
-
-    Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);
-    lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
-    if (OrigArgRegs.size() > 1)
-      unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
+
+    if (IsByRef) {
+      unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();
+
+      assert(VRegs[i].size() == 1 &&
+             "expected only one register for byval pointers");
+      if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
+        lowerParameterPtr(VRegs[i][0], B, ArgTy, ArgOffset);
+      } else {
+        const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+        Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy);
+        lowerParameterPtr(PtrReg, B, ArgTy, ArgOffset);
+
+        B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
+      }
+    } else {
+      ArrayRef<Register> OrigArgRegs = VRegs[i];
+      Register ArgReg =
+        OrigArgRegs.size() == 1
+        ? OrigArgRegs[0]
+        : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
+
+      lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
+      if (OrigArgRegs.size() > 1)
+        unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
+    }
+
+    ++i;
   }
Index: llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -89,7 +89,8 @@
   void emitKernelArg(const Argument &Arg, unsigned &Offset,
                      msgpack::ArrayDocNode Args);
 
-  void emitKernelArg(const DataLayout &DL, Type *Ty, StringRef ValueKind,
+  void emitKernelArg(const DataLayout &DL, Type *Ty, Align Alignment,
+                     StringRef ValueKind,
                      unsigned &Offset, msgpack::ArrayDocNode Args,
                      MaybeAlign PointeeAlign = None, StringRef Name = "",
                      StringRef TypeName = "", StringRef BaseTypeName = "",
@@ -160,7 +161,7 @@
 
   void emitKernelArg(const Argument &Arg);
 
-  void emitKernelArg(const DataLayout &DL, Type *Ty, ValueKind ValueKind,
+  void emitKernelArg(const DataLayout &DL, Type *Ty, Align Alignment, ValueKind ValueKind,
                      MaybeAlign PointeeAlign = None, StringRef Name = "",
                      StringRef TypeName = "", StringRef BaseTypeName = "",
                      StringRef AccQual = "", StringRef TypeQual = "");
Index: llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -24,6 +24,23 @@
 #include "llvm/IR/Module.h"
 #include "llvm/Support/raw_ostream.h"
 
+using namespace llvm;
+
+static std::pair<Type *, Align> getArgumentTypeAlign(const Argument &Arg,
+                                                     const DataLayout &DL) {
+  Type *Ty = Arg.getType();
+  MaybeAlign ArgAlign;
+  if (Arg.hasByRefAttr()) {
+    Ty = Arg.getParamByRefType();
+    ArgAlign = Arg.getParamAlign();
+  }
+
+  if (!ArgAlign)
+    ArgAlign = DL.getABITypeAlign(Ty);
+
+  return std::make_pair(Ty, *ArgAlign);
+}
+
 namespace llvm {
 
 static cl::opt<bool> DumpHSAMetadata(
@@ -343,23 +360,28 @@
   if (Node && ArgNo < Node->getNumOperands())
     TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
 
-  Type *Ty = Arg.getType();
   const DataLayout &DL = Func->getParent()->getDataLayout();
 
   MaybeAlign PointeeAlign;
-  if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
+  if (auto PtrTy = dyn_cast<PointerType>(Arg.getType())) {
     if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+      // FIXME: Should report this for all address spaces
       PointeeAlign = DL.getValueOrABITypeAlignment(Arg.getParamAlign(),
                                                    PtrTy->getElementType());
     }
   }
 
-  emitKernelArg(DL, Ty, getValueKind(Arg.getType(), TypeQual, BaseTypeName),
-                PointeeAlign, Name, TypeName, BaseTypeName,
AccQual, TypeQual); + Type *ArgTy; + Align ArgAlign; + std::tie(ArgTy, ArgAlign) = getArgumentTypeAlign(Arg, DL); + + emitKernelArg(DL, ArgTy, ArgAlign, + getValueKind(ArgTy, TypeQual, BaseTypeName), PointeeAlign, Name, + TypeName, BaseTypeName, AccQual, TypeQual); } void MetadataStreamerV2::emitKernelArg(const DataLayout &DL, Type *Ty, - ValueKind ValueKind, + Align Alignment, ValueKind ValueKind, MaybeAlign PointeeAlign, StringRef Name, StringRef TypeName, StringRef BaseTypeName, @@ -370,7 +392,7 @@ Arg.mName = std::string(Name); Arg.mTypeName = std::string(TypeName); Arg.mSize = DL.getTypeAllocSize(Ty); - Arg.mAlign = DL.getABITypeAlign(Ty).value(); + Arg.mAlign = Alignment.value(); Arg.mValueKind = ValueKind; Arg.mValueType = getValueType(Ty, BaseTypeName); Arg.mPointeeAlign = PointeeAlign ? PointeeAlign->value() : 0; @@ -407,11 +429,11 @@ auto Int64Ty = Type::getInt64Ty(Func.getContext()); if (HiddenArgNumBytes >= 8) - emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetX); + emitKernelArg(DL, Int64Ty, Align(8), ValueKind::HiddenGlobalOffsetX); if (HiddenArgNumBytes >= 16) - emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY); + emitKernelArg(DL, Int64Ty, Align(8), ValueKind::HiddenGlobalOffsetY); if (HiddenArgNumBytes >= 24) - emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ); + emitKernelArg(DL, Int64Ty, Align(8), ValueKind::HiddenGlobalOffsetZ); auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS); @@ -420,31 +442,31 @@ // "none" argument. if (HiddenArgNumBytes >= 32) { if (Func.getParent()->getNamedMetadata("llvm.printf.fmts")) - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer); + emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenPrintfBuffer); else if (Func.getParent()->getFunction("__ockl_hostcall_internal")) { // The printf runtime binding pass should have ensured that hostcall and // printf are not used in the same module. assert(!Func.getParent()->getNamedMetadata("llvm.printf.fmts")); - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenHostcallBuffer); + emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenHostcallBuffer); } else - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone); + emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone); } // Emit "default queue" and "completion action" arguments if enqueue kernel is // used, otherwise emit dummy "none" arguments. if (HiddenArgNumBytes >= 48) { if (Func.hasFnAttribute("calls-enqueue-kernel")) { - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenDefaultQueue); - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenCompletionAction); + emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenDefaultQueue); + emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenCompletionAction); } else { - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone); - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone); + emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone); + emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone); } } // Emit the pointer argument for multi-grid object. 
if (HiddenArgNumBytes >= 56) - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenMultiGridSyncArg); + emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenMultiGridSyncArg); } bool MetadataStreamerV2::emitTo(AMDGPUTargetStreamer &TargetStreamer) { @@ -764,10 +786,12 @@ if (Node && ArgNo < Node->getNumOperands()) TypeQual = cast(Node->getOperand(ArgNo))->getString(); - Type *Ty = Arg.getType(); const DataLayout &DL = Func->getParent()->getDataLayout(); MaybeAlign PointeeAlign; + Type *Ty = Arg.hasByRefAttr() ? Arg.getParamByRefType() : Arg.getType(); + + // FIXME: Need to distinguish in memory alignment from pointer alignment. if (auto PtrTy = dyn_cast(Ty)) { if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { PointeeAlign = DL.getValueOrABITypeAlignment(Arg.getParamAlign(), @@ -775,15 +799,19 @@ } } - emitKernelArg(Func->getParent()->getDataLayout(), Arg.getType(), - getValueKind(Arg.getType(), TypeQual, BaseTypeName), Offset, - Args, PointeeAlign, Name, TypeName, BaseTypeName, AccQual, - TypeQual); + // There's no distinction between byval aggregates and raw aggregates. + Type *ArgTy; + Align ArgAlign; + std::tie(ArgTy, ArgAlign) = getArgumentTypeAlign(Arg, DL); + + emitKernelArg(DL, ArgTy, ArgAlign, + getValueKind(ArgTy, TypeQual, BaseTypeName), Offset, Args, + PointeeAlign, Name, TypeName, BaseTypeName, AccQual, TypeQual); } void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty, - StringRef ValueKind, unsigned &Offset, - msgpack::ArrayDocNode Args, + Align Alignment, StringRef ValueKind, + unsigned &Offset, msgpack::ArrayDocNode Args, MaybeAlign PointeeAlign, StringRef Name, StringRef TypeName, StringRef BaseTypeName, @@ -795,7 +823,6 @@ if (!TypeName.empty()) Arg[".type_name"] = Arg.getDocument()->getNode(TypeName, /*Copy=*/true); auto Size = DL.getTypeAllocSize(Ty); - Align Alignment = DL.getABITypeAlign(Ty); Arg[".size"] = Arg.getDocument()->getNode(Size); Offset = alignTo(Offset, Alignment); Arg[".offset"] = Arg.getDocument()->getNode(Offset); @@ -844,11 +871,11 @@ auto Int64Ty = Type::getInt64Ty(Func.getContext()); if (HiddenArgNumBytes >= 8) - emitKernelArg(DL, Int64Ty, "hidden_global_offset_x", Offset, Args); + emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset, Args); if (HiddenArgNumBytes >= 16) - emitKernelArg(DL, Int64Ty, "hidden_global_offset_y", Offset, Args); + emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset, Args); if (HiddenArgNumBytes >= 24) - emitKernelArg(DL, Int64Ty, "hidden_global_offset_z", Offset, Args); + emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset, Args); auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS); @@ -857,31 +884,31 @@ // "none" argument. if (HiddenArgNumBytes >= 32) { if (Func.getParent()->getNamedMetadata("llvm.printf.fmts")) - emitKernelArg(DL, Int8PtrTy, "hidden_printf_buffer", Offset, Args); + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset, Args); else if (Func.getParent()->getFunction("__ockl_hostcall_internal")) { // The printf runtime binding pass should have ensured that hostcall and // printf are not used in the same module. 
assert(!Func.getParent()->getNamedMetadata("llvm.printf.fmts")); - emitKernelArg(DL, Int8PtrTy, "hidden_hostcall_buffer", Offset, Args); + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset, Args); } else - emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args); + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args); } // Emit "default queue" and "completion action" arguments if enqueue kernel is // used, otherwise emit dummy "none" arguments. if (HiddenArgNumBytes >= 48) { if (Func.hasFnAttribute("calls-enqueue-kernel")) { - emitKernelArg(DL, Int8PtrTy, "hidden_default_queue", Offset, Args); - emitKernelArg(DL, Int8PtrTy, "hidden_completion_action", Offset, Args); + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset, Args); + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset, Args); } else { - emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args); - emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args); + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args); + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args); } } // Emit the pointer argument for multi-grid object. if (HiddenArgNumBytes >= 56) - emitKernelArg(DL, Int8PtrTy, "hidden_multigrid_sync_arg", Offset, Args); + emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset, Args); } msgpack::MapDocNode Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1017,10 +1017,14 @@ unsigned InIndex = 0; for (const Argument &Arg : Fn.args()) { + const bool IsByRef = Arg.hasByRefAttr(); Type *BaseArgTy = Arg.getType(); - Align Alignment = DL.getABITypeAlign(BaseArgTy); - MaxAlign = std::max(Alignment, MaxAlign); - unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy); + Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy; + MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None; + if (!Alignment) + Alignment = DL.getABITypeAlign(MemArgTy); + MaxAlign = max(Alignment, MaxAlign); + uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy); uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset; ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize; Index: llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -108,10 +108,14 @@ uint64_t ExplicitArgOffset = 0; for (Argument &Arg : F.args()) { - Type *ArgTy = Arg.getType(); - Align ABITypeAlign = DL.getABITypeAlign(ArgTy); - unsigned Size = DL.getTypeSizeInBits(ArgTy); - unsigned AllocSize = DL.getTypeAllocSize(ArgTy); + const bool IsByRef = Arg.hasByRefAttr(); + Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType(); + MaybeAlign ABITypeAlign = IsByRef ? Arg.getParamAlign() : None; + if (!ABITypeAlign) + ABITypeAlign = DL.getABITypeAlign(ArgTy); + + uint64_t Size = DL.getTypeSizeInBits(ArgTy); + uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset; ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize; @@ -119,6 +123,19 @@ if (Arg.use_empty()) continue; + // If this is byval, the loads are already explicit in the function. 
We just + // need to rewrite the pointer values. + if (IsByRef) { + Value *ArgOffsetPtr = Builder.CreateConstInBoundsGEP1_64( + Builder.getInt8Ty(), KernArgSegment, EltOffset, + Arg.getName() + ".byval.kernarg.offset"); + + Value *CastOffsetPtr = Builder.CreatePointerBitCastOrAddrSpaceCast( + ArgOffsetPtr, Arg.getType()); + Arg.replaceAllUsesWith(CastOffsetPtr); + continue; + } + if (PointerType *PT = dyn_cast(ArgTy)) { // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing // modes on SI to know the high bits are 0 so pointer adds don't wrap. We Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -513,12 +513,15 @@ MaxAlign = Align(1); for (const Argument &Arg : F.args()) { - Type *ArgTy = Arg.getType(); + const bool IsByRef = Arg.hasByRefAttr(); + Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType(); + MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None; + if (!Alignment) + Alignment = DL.getABITypeAlign(ArgTy); - const Align Alignment = DL.getABITypeAlign(ArgTy); uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize; - MaxAlign = std::max(MaxAlign, Alignment); + MaxAlign = max(MaxAlign, Alignment); } return ExplicitArgBytes; Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2253,9 +2253,21 @@ const uint64_t Offset = VA.getLocMemOffset(); Align Alignment = commonAlignment(KernelArgBaseAlign, Offset); - SDValue Arg = - lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset, Alignment, - Ins[i].Flags.isSExt(), &Ins[i]); + if (Arg.Flags.isByRef()) { + SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset); + + if (!isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS, + Arg.Flags.getPointerAddrSpace())) { + Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS, + Arg.Flags.getPointerAddrSpace()); + } + + InVals.push_back(Ptr); + continue; + } + + SDValue Arg = lowerKernargMemParameter( + DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]); Chains.push_back(Arg.getValue(1)); auto *ParamTy = Index: llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll @@ -1265,3 +1265,500 @@ entry: ret void } + +; Byref pointers should only be treated as offsets from kernarg +define amdgpu_kernel void @byref_constant_i8_arg(i32 addrspace(1)* nocapture %out, i8 addrspace(4)* byref(i8) %in.byref) { + ; HSA-VI-LABEL: name: byref_constant_i8_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (load 1 from %ir.in.byref, addrspace 4) + ; 
HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8) + ; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_constant_i8_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD1]](p4) :: (load 1 from %ir.in.byref, addrspace 4) + ; LEGACY-MESA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8) + ; LEGACY-MESA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load i8, i8 addrspace(4)* %in.byref + %ext = zext i8 %in to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_constant_i16_arg(i32 addrspace(1)* nocapture %out, i16 addrspace(4)* byref(i16) align 2 %in.byref) { + ; HSA-VI-LABEL: name: byref_constant_i16_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD1]](p4) :: (load 2 from %ir.in.byref, addrspace 4) + ; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16) + ; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_constant_i16_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[PTR_ADD1]](p4) :: (load 2 from %ir.in.byref, addrspace 4) + ; LEGACY-MESA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16) + ; LEGACY-MESA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load i16, i16 addrspace(4)* %in.byref + %ext = zext i16 %in to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align 4 %in.byref, i32 %after.offset) { + ; HSA-VI-LABEL: name: byref_constant_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; 
HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; HSA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load 4, addrspace 4) + ; HSA-VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load 4 from %ir.in.byref, addrspace 4) + ; HSA-VI: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_constant_i32_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 + ; LEGACY-MESA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load 4, align 16, addrspace 4) + ; LEGACY-MESA-VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load 4 from %ir.in.byref, addrspace 4) + ; LEGACY-MESA-VI: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load i32, i32 addrspace(4)* %in.byref + store volatile i32 %in, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_constant_v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> addrspace(4)* byref(<4 x i32>) align(16) %in.byref, i32 %after.offset) { + ; HSA-VI-LABEL: name: byref_constant_v4i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 + ; HSA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load 4, align 16, addrspace 4) + ; HSA-VI: [[LOAD2:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (load 16 from %ir.in.byref, addrspace 4) + ; HSA-VI: G_STORE [[LOAD2]](<4 x s32>), [[LOAD]](p1) :: (volatile store 16 into %ir.out, align 4, addrspace 1) + 
; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out.cast, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_constant_v4i32_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 52 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 68 + ; LEGACY-MESA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load 4, addrspace 4) + ; LEGACY-MESA-VI: [[LOAD2:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (load 16 from %ir.in.byref, addrspace 4) + ; LEGACY-MESA-VI: G_STORE [[LOAD2]](<4 x s32>), [[LOAD]](p1) :: (volatile store 16 into %ir.out, align 4, addrspace 1) + ; LEGACY-MESA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out.cast, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load <4 x i32>, <4 x i32> addrspace(4)* %in.byref + store volatile <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 + %out.cast = bitcast <4 x i32> addrspace(1)* %out to i32 addrspace(1)* + store volatile i32 %after.offset, i32 addrspace(1)* %out.cast, align 4 + ret void +} + +define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) { + ; HSA-VI-LABEL: name: byref_align_constant_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 256 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 260 + ; HSA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load 4, addrspace 4) + ; HSA-VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load 4 from %ir.in.byref, addrspace 4) + ; HSA-VI: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_align_constant_i32_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 292 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], 
[[C1]](s64) + ; LEGACY-MESA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 296 + ; LEGACY-MESA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load 4, align 8, addrspace 4) + ; LEGACY-MESA-VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load 4 from %ir.in.byref, addrspace 4) + ; LEGACY-MESA-VI: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load i32, i32 addrspace(4)* %in.byref + store volatile i32 %in, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) { + ; HSA-VI-LABEL: name: byref_natural_align_constant_v16i32_arg + ; HSA-VI: bb.1 (%ir-block.1): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 128 + ; HSA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load 4, align 16, addrspace 4) + ; HSA-VI: [[LOAD2:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (load 64 from %ir.in.byref, addrspace 4) + ; HSA-VI: G_STORE [[LOAD2]](<16 x s32>), [[LOAD]](p1) :: (volatile store 64 into %ir.cast.out, align 4, addrspace 1) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_natural_align_constant_v16i32_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.1): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 100 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 164 + ; LEGACY-MESA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (dereferenceable invariant load 4, addrspace 4) + ; LEGACY-MESA-VI: [[LOAD2:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (load 64 from %ir.in.byref, addrspace 4) + ; LEGACY-MESA-VI: G_STORE [[LOAD2]](<16 x s32>), [[LOAD]](p1) :: (volatile store 64 into %ir.cast.out, align 4, addrspace 1) + ; LEGACY-MESA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load <16 x i32>, <16 x i32> addrspace(4)* %in.byref + %cast.out = bitcast i32 
addrspace(1)* %out to <16 x i32> addrspace(1)* + store volatile <16 x i32> %in, <16 x i32> addrspace(1)* %cast.out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +; Also accept byref kernel arguments with other global address spaces. +define amdgpu_kernel void @byref_global_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* byref(i32) align(4) %in.byref) { + ; HSA-VI-LABEL: name: byref_global_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p1) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; HSA-VI: [[COPY1:%[0-9]+]]:_(p1) = COPY [[ADDRSPACE_CAST]](p1) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[COPY1]](p1) :: (load 4 from %ir.1, addrspace 1) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_global_i32_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p1) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; LEGACY-MESA-VI: [[COPY1:%[0-9]+]]:_(p1) = COPY [[ADDRSPACE_CAST]](p1) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[COPY1]](p1) :: (load 4 from %ir.1, addrspace 1) + ; LEGACY-MESA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load i32, i32 addrspace(1)* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_flat_i32_arg(i32 addrspace(1)* nocapture %out, i32* byref(i32) align(4) %in.byref) { + ; HSA-VI-LABEL: name: byref_flat_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p0) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p0) :: (load 4 from %ir.in.byref) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_flat_i32_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY 
$sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p0) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p0) :: (load 4 from %ir.in.byref) + ; LEGACY-MESA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load i32, i32* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_constant_32bit_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(6)* byref(i32) align(4) %in.byref) { + ; HSA-VI-LABEL: name: byref_constant_32bit_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p6) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p6) :: (load 4 from %ir.in.byref, addrspace 6) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_constant_32bit_i32_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p6) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p6) :: (load 4 from %ir.in.byref, addrspace 6) + ; LEGACY-MESA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load i32, i32 addrspace(6)* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_unknown_as_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(999)* byref(i32) align(4) %in.byref) { + ; HSA-VI-LABEL: name: byref_unknown_as_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = 
G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p999) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p999) :: (load 4 from %ir.in.byref, addrspace 999) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_unknown_as_i32_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p999) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p999) :: (load 4 from %ir.in.byref, addrspace 999) + ; LEGACY-MESA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load i32, i32 addrspace(999)* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +; Invalid, but should not crash. +define amdgpu_kernel void @byref_private_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(5)* byref(i32) align(4) %in.byref) { + ; HSA-VI-LABEL: name: byref_private_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p5) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p5) :: (load 4 from %ir.in.byref, addrspace 5) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_private_i32_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p5) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p5) :: (load 4 from %ir.in.byref, addrspace 5) + ; LEGACY-MESA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load i32, i32 addrspace(5)* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +; Invalid, but should not crash. 
+define amdgpu_kernel void @byref_local_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(3)* byref(i32) align(4) %in.byref) { + ; HSA-VI-LABEL: name: byref_local_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p3) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p3) :: (load 4 from %ir.in.byref, addrspace 3) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_local_i32_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[ADDRSPACE_CAST:%[0-9]+]]:_(p3) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p3) :: (load 4 from %ir.in.byref, addrspace 3) + ; LEGACY-MESA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load i32, i32 addrspace(3)* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @multi_byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(4) %in0.byref, i32 addrspace(4)* byref(i32) align(4) %in1.byref, i32 %after.offset) { + ; HSA-VI-LABEL: name: multi_byref_constant_i32_arg + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 16, addrspace 4) + ; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; HSA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 + ; HSA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; HSA-VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; HSA-VI: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load 4, align 16, addrspace 4) + ; HSA-VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load 4 from %ir.in0.byref, addrspace 4) + ; HSA-VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (load 4 from %ir.in1.byref, addrspace 4) + ; HSA-VI: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; HSA-VI: G_STORE [[LOAD3]](s32), [[LOAD]](p1) :: (volatile store 4 into 
%ir.out, addrspace 1) + ; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: multi_byref_constant_i32_arg + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; LEGACY-MESA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44 + ; LEGACY-MESA-VI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; LEGACY-MESA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 + ; LEGACY-MESA-VI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; LEGACY-MESA-VI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 52 + ; LEGACY-MESA-VI: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; LEGACY-MESA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p4) :: (dereferenceable invariant load 4, addrspace 4) + ; LEGACY-MESA-VI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load 4 from %ir.in0.byref, addrspace 4) + ; LEGACY-MESA-VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (load 4 from %ir.in1.byref, addrspace 4) + ; LEGACY-MESA-VI: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: G_STORE [[LOAD3]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in0 = load i32, i32 addrspace(4)* %in0.byref + %in1 = load i32, i32 addrspace(4)* %in1.byref + store volatile i32 %in0, i32 addrspace(1)* %out, align 4 + store volatile i32 %in1, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_constant_i32_arg_offset0(i32 addrspace(4)* byref(i32) align(4) %in.byref) { + ; HSA-VI-LABEL: name: byref_constant_i32_arg_offset0 + ; HSA-VI: bb.1 (%ir-block.0): + ; HSA-VI: liveins: $sgpr4_sgpr5 + ; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 + ; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; HSA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; HSA-VI: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; HSA-VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (load 4 from %ir.in.byref, addrspace 4) + ; HSA-VI: G_STORE [[LOAD]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; HSA-VI: S_ENDPGM 0 + ; LEGACY-MESA-VI-LABEL: name: byref_constant_i32_arg_offset0 + ; LEGACY-MESA-VI: bb.1 (%ir-block.0): + ; LEGACY-MESA-VI: liveins: $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; LEGACY-MESA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 36 + ; LEGACY-MESA-VI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; LEGACY-MESA-VI: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; LEGACY-MESA-VI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (load 4 from %ir.in.byref, addrspace 4) + ; LEGACY-MESA-VI: G_STORE [[LOAD]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; LEGACY-MESA-VI: S_ENDPGM 0 + %in = load i32, i32 addrspace(4)* %in.byref + store i32 %in, i32 addrspace(1)* undef, align 4 + ret void +} Index: 
llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll +++ llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll @@ -59,6 +59,93 @@ ret void } +; CHECK: - .args: +; CHECK-NEXT: - .name: a +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 1 +; CHECK-NEXT: .type_name: char +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: - .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NOT: .value_kind: hidden_default_queue +; CHECK-NOT: .value_kind: hidden_completion_action +; CHECK-NOT: .value_kind: hidden_hostcall_buffer +; CHECK-NEXT: .value_kind: hidden_printf_buffer +; CHECK-NEXT: .value_type: i8 +; CHECK: .value_kind: hidden_multigrid_sync_arg +; CHECK-NEXT: .value_type: i8 +; CHECK: .language: OpenCL C +; CHECK-NEXT: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .name: test_char_byref_constant +; CHECK: .symbol: test_char_byref_constant.kd +define amdgpu_kernel void @test_char_byref_constant(i8 addrspace(4)* byref(i8) %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9 + !kernel_arg_base_type !9 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - .args: +; CHECK-NEXT: - .offset: 0 +; CHECK-NEXT: .size: 1 +; CHECK-NEXT: .type_name: char +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: - .name: a +; CHECK-NEXT: .offset: 512 +; CHECK-NEXT: .size: 1 +; CHECK-NEXT: .type_name: char +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: - .offset: 520 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .offset: 528 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .offset: 536 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 544 +; CHECK-NEXT: .size: 8 +; CHECK-NOT: .value_kind: hidden_default_queue +; CHECK-NOT: .value_kind: hidden_completion_action +; CHECK-NOT: .value_kind: hidden_hostcall_buffer +; CHECK-NEXT: .value_kind: hidden_printf_buffer +; CHECK-NEXT: .value_type: i8 +; CHECK: .value_kind: hidden_multigrid_sync_arg +; CHECK-NEXT: .value_type: i8 +; CHECK: .language: OpenCL C +; CHECK-NEXT: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .name: test_char_byref_constant_align512 +; CHECK: .symbol: test_char_byref_constant_align512.kd +define amdgpu_kernel void @test_char_byref_constant_align512(i8, i8 addrspace(4)* byref(i8) align(512) %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !111 + !kernel_arg_base_type !9 !kernel_arg_type_qual !4 { + ret void +} + ; CHECK: - .args: ; CHECK-NEXT: - .name: a ; CHECK-NEXT: .offset: 0 @@ -623,6 +710,57 @@ ret void } +; CHECK: - .args: +; CHECK-NEXT: .name: a +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: 
.type_name: struct A +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .value_type: struct +; CHECK-NEXT: - .offset: 8 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .offset: 16 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .offset: 24 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_printf_buffer +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 40 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_none +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 48 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_none +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 56 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg +; CHECK-NEXT: .value_type: i8 +; CHECK: .language: OpenCL C +; CHECK-NEXT: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .name: test_struct_byref_constant +; CHECK: .symbol: test_struct_byref_constant.kd +define amdgpu_kernel void @test_struct_byref_constant(%struct.A addrspace(4)* byref(%struct.A) %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20 + !kernel_arg_base_type !20 !kernel_arg_type_qual !4 { + ret void +} + ; CHECK: - .args: ; CHECK-NEXT: .name: a ; CHECK-NEXT: .offset: 0 @@ -674,6 +812,57 @@ ret void } +; CHECK: - .args: +; CHECK-NEXT: .name: a +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .size: 32 +; CHECK-NEXT: .type_name: struct A +; CHECK-NEXT: .value_kind: by_value +; CHECK-NEXT: .value_type: struct +; CHECK-NEXT: - .offset: 32 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_x +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .offset: 40 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_y +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .offset: 48 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_global_offset_z +; CHECK-NEXT: .value_type: i64 +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 56 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_printf_buffer +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 64 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_none +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 72 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_none +; CHECK-NEXT: .value_type: i8 +; CHECK-NEXT: - .address_space: global +; CHECK-NEXT: .offset: 80 +; CHECK-NEXT: .size: 8 +; CHECK-NEXT: .value_kind: hidden_multigrid_sync_arg +; CHECK-NEXT: .value_type: i8 +; CHECK: .language: OpenCL C +; CHECK-NEXT: .language_version: +; CHECK-NEXT: - 2 +; CHECK-NEXT: - 0 +; CHECK: .name: test_array_byref_constant +; CHECK: .symbol: test_array_byref_constant.kd +define amdgpu_kernel void @test_array_byref_constant([32 x i8] addrspace(4)* byref([32 x i8]) %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20 + !kernel_arg_base_type !20 !kernel_arg_type_qual !4 { + ret void +} + ; CHECK: - .args: ; CHECK-NEXT: - .name: a ; CHECK-NEXT: .offset: 0 @@ -2093,5 +2282,6 @@ !100 = !{!"1:1:4:%d\5Cn"} 
!101 = !{!"2:1:8:%g\5Cn"} !110 = !{!"__block_literal"} +!111 = !{!"char", !"char"} ; PARSER: AMDGPU HSA Metadata Parser Test: PASS Index: llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll +++ llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll @@ -64,6 +64,93 @@ ret void } +; CHECK: - Name: test_char_byref_constant +; CHECK-NEXT: SymbolName: 'test_char_byref_constant@kd' +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Name: a +; CHECK-NEXT: TypeName: char +; CHECK-NEXT: Size: 1 +; CHECK-NEXT: Align: 1 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NOT: ValueKind: HiddenHostcallBuffer +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NOT: ValueKind: HiddenDefaultQueue +; CHECK-NOT: ValueKind: HiddenCompletionAction +; CHECK: ValueKind: HiddenMultiGridSyncArg +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_char_byref_constant(i8 addrspace(4)* byref(i8) %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9 + !kernel_arg_base_type !9 !kernel_arg_type_qual !4 { + ret void +} + +; CHECK: - Name: test_char_byref_constant_align512 +; CHECK-NEXT: SymbolName: 'test_char_byref_constant_align512@kd' +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: TypeName: char +; CHECK-NEXT: Size: 1 +; CHECK-NEXT: Align: 1 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: - Name: a +; CHECK-NEXT: TypeName: char +; CHECK-NEXT: Size: 1 +; CHECK-NEXT: Align: 512 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NOT: ValueKind: HiddenHostcallBuffer +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NOT: ValueKind: HiddenDefaultQueue +; CHECK-NOT: ValueKind: HiddenCompletionAction +; CHECK: ValueKind: HiddenMultiGridSyncArg +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_char_byref_constant_align512(i8, i8 addrspace(4)* byref(i8) align 512 %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !111 + !kernel_arg_base_type !9 !kernel_arg_type_qual !4 { + ret void +} + ; CHECK: - Name: test_ushort2 ; CHECK-NEXT: SymbolName: 'test_ushort2@kd' ; CHECK-NEXT: Language: OpenCL C @@ -617,6 +704,56 @@ ret 
void } +; CHECK: - Name: test_struct_byref_constant +; CHECK-NEXT: SymbolName: 'test_struct_byref_constant@kd' +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Name: a +; CHECK-NEXT: TypeName: struct A +; CHECK-NEXT: Size: 8 +; CHECK-NEXT: Align: 4 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: Struct +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenNone +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenNone +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenMultiGridSyncArg +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_struct_byref_constant(%struct.A addrspace(4)* byref(%struct.A) %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20 + !kernel_arg_base_type !20 !kernel_arg_type_qual !4 { + ret void +} + ; CHECK: - Name: test_array ; CHECK-NEXT: SymbolName: 'test_array@kd' ; CHECK-NEXT: Language: OpenCL C @@ -667,6 +804,56 @@ ret void } +; CHECK: - Name: test_array_byref_constant +; CHECK-NEXT: SymbolName: 'test_array_byref_constant@kd' +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Args: +; CHECK-NEXT: - Name: a +; CHECK-NEXT: TypeName: struct A +; CHECK-NEXT: Size: 8 +; CHECK-NEXT: Align: 1 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: Struct +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenNone +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenNone +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenMultiGridSyncArg +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @test_array_byref_constant([8 x i8] addrspace(4)* byref([8 x i8]) %a) #0 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20 + !kernel_arg_base_type !20 !kernel_arg_type_qual !4 { + ret void +} + ; CHECK: - Name: test_i128 ; CHECK-NEXT: SymbolName: 'test_i128@kd' ; CHECK-NEXT: Language: OpenCL C 
@@ -2064,5 +2251,5 @@ !100 = !{!"1:1:4:%d\5Cn"} !101 = !{!"2:1:8:%g\5Cn"} !110 = !{!"__block_literal"} - +!111 = !{!"char", !"char"} ; PARSER: AMDGPU HSA Metadata Parser Test: PASS Index: llvm/test/CodeGen/AMDGPU/kernel-args.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -920,3 +920,25 @@ store volatile i8 %val, i8 addrspace(1)* undef ret void } + +; GCN-LABEL: {{^}}byref_align_constant_i32_arg: +; HSA-GFX9: kernarg_segment_byte_size = 264 +; HSA-GFX9-DAG: s_load_dwordx2 {{s\[[0-9]+:[0-9]+\]}}, s[4:5], 0x100{{$}} +define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) { + %in = load i32, i32 addrspace(4)* %in.byref + store volatile i32 %in, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}byref_natural_align_constant_v16i32_arg: +; HSA-GFX9: kernarg_segment_byte_size = 132 +; HSA-GFX9-DAG: s_load_dword s{{[0-9]+}}, s[4:5], 0x80 +; HSA-GFX9-DAG: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x40{{$}} +define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byref(<16 x i32>) %in.byref, i32 %after.offset) { + %in = load <16 x i32>, <16 x i32> addrspace(4)* %in.byref + %cast.out = bitcast i32 addrspace(1)* %out to <16 x i32> addrspace(1)* + store volatile <16 x i32> %in, <16 x i32> addrspace(1)* %cast.out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} Index: llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll +++ llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll @@ -152,3 +152,137 @@ store <3 x i15> %in, <3 x i15> addrspace(1)* %out, align 4 ret void } + +; Byref pointers should only be treated as offsets from kernarg +; GCN-LABEL: {{^}}byref_constant_i8_arg: +; GCN: kernarg_segment_byte_size = 12 +; GCN: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s4 +; GCN: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], s5 +; GCN: global_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]{{\]}}, off offset:8 +define amdgpu_kernel void @byref_constant_i8_arg(i32 addrspace(1)* nocapture %out, i8 addrspace(4)* byref(i8) %in.byref) { + %in = load i8, i8 addrspace(4)* %in.byref + %ext = zext i8 %in to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}byref_constant_i16_arg: +; GCN: kernarg_segment_byte_size = 12 +; GCN: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s4 +; GCN: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], s5 +; GCN: global_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]{{\]}}, off offset:8 +define amdgpu_kernel void @byref_constant_i16_arg(i32 addrspace(1)* nocapture %out, i16 addrspace(4)* byref(i16) %in.byref) { + %in = load i16, i16 addrspace(4)* %in.byref + %ext = zext i16 %in to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}byref_constant_i32_arg: +; GCN: kernarg_segment_byte_size = 16 +; GCN: s_load_dword [[IN:s[0-9]+]], s[4:5], 0x8{{$}} +; GCN: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0xc{{$}} +define amdgpu_kernel void @byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) %in.byref, i32 %after.offset) { + %in = load i32, i32 addrspace(4)* %in.byref + 
store volatile i32 %in, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}byref_constant_v4i32_arg: +; GCN: kernarg_segment_byte_size = 36 +; GCN: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10{{$}} +; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x20{{$}} +define amdgpu_kernel void @byref_constant_v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> addrspace(4)* byref(<4 x i32>) %in.byref, i32 %after.offset) { + %in = load <4 x i32>, <4 x i32> addrspace(4)* %in.byref + store volatile <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 + %out.cast = bitcast <4 x i32> addrspace(1)* %out to i32 addrspace(1)* + store volatile i32 %after.offset, i32 addrspace(1)* %out.cast, align 4 + ret void +} + +; GCN-LABEL: {{^}}byref_align_constant_i32_arg: +; GCN: kernarg_segment_byte_size = 264 +; GCN-DAG: s_load_dword [[IN:s[0-9]+]], s[4:5], 0x100{{$}} +; GCN-DAG: s_load_dword [[AFTER_OFFSET:s[0-9]+]], s[4:5], 0x104{{$}} +; GCN-DAG: v_mov_b32_e32 [[V_IN:v[0-9]+]], [[IN]] +; GCN-DAG: v_mov_b32_e32 [[V_AFTER_OFFSET:v[0-9]+]], [[AFTER_OFFSET]] +; GCN: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[V_IN]] +; GCN: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[V_AFTER_OFFSET]] +define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) { + %in = load i32, i32 addrspace(4)* %in.byref + store volatile i32 %in, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}byref_natural_align_constant_v16i32_arg: +; GCN: kernarg_segment_byte_size = 132 +; GCN-DAG: s_load_dword s{{[0-9]+}}, s[4:5], 0x80 +; GCN-DAG: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x40{{$}} +define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) { + %in = load <16 x i32>, <16 x i32> addrspace(4)* %in.byref + %cast.out = bitcast i32 addrspace(1)* %out to <16 x i32> addrspace(1)* + store volatile <16 x i32> %in, <16 x i32> addrspace(1)* %cast.out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +; Also accept byref kernel arguments with other global address spaces. 
+; GCN-LABEL: {{^}}byref_global_i32_arg: +; GCN: kernarg_segment_byte_size = 12 +; GCN: s_load_dword [[IN:s[0-9]+]], s[4:5], 0x8{{$}} +define amdgpu_kernel void @byref_global_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* byref(i32) %in.byref) { + %in = load i32, i32 addrspace(1)* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}byref_flat_i32_arg: +; GCN: flat_load_dword [[IN:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}} offset:8{{$}} +define amdgpu_kernel void @byref_flat_i32_arg(i32 addrspace(1)* nocapture %out, i32* byref(i32) %in.byref) { + %in = load i32, i32* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}byref_constant_32bit_i32_arg: +; GCN: s_add_i32 s[[PTR_LO:[0-9]+]], s4, 8 +; GCN: s_mov_b32 s[[PTR_HI:[0-9]+]], 0{{$}} +; GCN: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x0{{$}} +define amdgpu_kernel void @byref_constant_32bit_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(6)* byref(i32) %in.byref) { + %in = load i32, i32 addrspace(6)* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +; define amdgpu_kernel void @byref_unknown_as_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(999)* byref %in.byref) { +; %in = load i32, i32 addrspace(999)* %in.byref +; store i32 %in, i32 addrspace(1)* %out, align 4 +; ret void +; } + +; GCN-LABEL: {{^}}multi_byref_constant_i32_arg: +; GCN: kernarg_segment_byte_size = 20 +; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0x8 +; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0xc +; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0x10 +define amdgpu_kernel void @multi_byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) %in0.byref, i32 addrspace(4)* byref(i32) %in1.byref, i32 %after.offset) { + %in0 = load i32, i32 addrspace(4)* %in0.byref + %in1 = load i32, i32 addrspace(4)* %in1.byref + store volatile i32 %in0, i32 addrspace(1)* %out, align 4 + store volatile i32 %in1, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}byref_constant_i32_arg_offset0: +; GCN: kernarg_segment_byte_size = 4 +; GCN-NOT: s4 +; GCN-NOT: s5 +; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0x0{{$}} +define amdgpu_kernel void @byref_constant_i32_arg_offset0(i32 addrspace(4)* byref(i32) %in.byref) { + %in = load i32, i32 addrspace(4)* %in.byref + store i32 %in, i32 addrspace(1)* undef, align 4 + ret void +} Index: llvm/test/CodeGen/AMDGPU/lower-kernargs.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/lower-kernargs.ll +++ llvm/test/CodeGen/AMDGPU/lower-kernargs.ll @@ -1486,6 +1486,461 @@ ret void } +; Byref pointers should only be treated as offsets from kernarg +define amdgpu_kernel void @byref_constant_i8_arg(i32 addrspace(1)* nocapture %out, i8 addrspace(4)* byref(i8) %in.byref) { +; HSA-LABEL: @byref_constant_i8_arg( +; HSA-NEXT: [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: 
[[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[IN:%.*]] = load i8, i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 1 +; HSA-NEXT: [[EXT:%.*]] = zext i8 [[IN]] to i32 +; HSA-NEXT: store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_constant_i8_arg( +; MESA-NEXT: [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[IN:%.*]] = load i8, i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 1 +; MESA-NEXT: [[EXT:%.*]] = zext i8 [[IN]] to i32 +; MESA-NEXT: store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i8, i8 addrspace(4)* %in.byref + %ext = zext i8 %in to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_constant_i16_arg(i32 addrspace(1)* nocapture %out, i16 addrspace(4)* byref(i16) %in.byref) { +; HSA-LABEL: @byref_constant_i16_arg( +; HSA-NEXT: [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i16 addrspace(4)* +; HSA-NEXT: [[IN:%.*]] = load i16, i16 addrspace(4)* [[TMP1]], align 2 +; HSA-NEXT: [[EXT:%.*]] = zext i16 [[IN]] to i32 +; HSA-NEXT: store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_constant_i16_arg( +; MESA-NEXT: [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* 
[[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i16 addrspace(4)* +; MESA-NEXT: [[IN:%.*]] = load i16, i16 addrspace(4)* [[TMP1]], align 2 +; MESA-NEXT: [[EXT:%.*]] = zext i16 [[IN]] to i32 +; MESA-NEXT: store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i16, i16 addrspace(4)* %in.byref + %ext = zext i16 %in to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) %in.byref, i32 %after.offset) { +; HSA-LABEL: @byref_constant_i32_arg( +; HSA-NEXT: [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 12 +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4 +; HSA-NEXT: store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_constant_i32_arg( +; MESA-NEXT: [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 48 +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; MESA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4 +; MESA-NEXT: store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 
4 +; MESA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32 addrspace(4)* %in.byref + store volatile i32 %in, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_constant_v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> addrspace(4)* byref(<4 x i32>) %in.byref, i32 %after.offset) { +; HSA-LABEL: @byref_constant_v4i32_arg( +; HSA-NEXT: [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(36) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to <4 x i32> addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load <4 x i32> addrspace(1)*, <4 x i32> addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 16 +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to <4 x i32> addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 32 +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP1]], align 16 +; HSA-NEXT: store volatile <4 x i32> [[IN]], <4 x i32> addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: [[OUT_CAST:%.*]] = bitcast <4 x i32> addrspace(1)* [[OUT_LOAD]] to i32 addrspace(1)* +; HSA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_CAST]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_constant_v4i32_arg( +; MESA-NEXT: [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to <4 x i32> addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load <4 x i32> addrspace(1)*, <4 x i32> addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 52 +; MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to <4 x i32> addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 68 +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN:%.*]] = load <4 x i32>, <4 
x i32> addrspace(4)* [[TMP1]], align 16 +; MESA-NEXT: store volatile <4 x i32> [[IN]], <4 x i32> addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: [[OUT_CAST:%.*]] = bitcast <4 x i32> addrspace(1)* [[OUT_LOAD]] to i32 addrspace(1)* +; MESA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_CAST]], align 4 +; MESA-NEXT: ret void +; + %in = load <4 x i32>, <4 x i32> addrspace(4)* %in.byref + store volatile <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 + %out.cast = bitcast <4 x i32> addrspace(1)* %out to i32 addrspace(1)* + store volatile i32 %after.offset, i32 addrspace(1)* %out.cast, align 4 + ret void +} + +define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) { +; HSA-LABEL: @byref_align_constant_i32_arg( +; HSA-NEXT: [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 256 +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 260 +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4 +; HSA-NEXT: store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_align_constant_i32_arg( +; MESA-NEXT: [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(300) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 292 +; MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 296 +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] 
to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 8, !invariant.load !0 +; MESA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4 +; MESA-NEXT: store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32 addrspace(4)* %in.byref + store volatile i32 %in, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byref(<16 x i32>) %in.byref, i32 %after.offset) { +; HSA-LABEL: @byref_natural_align_constant_v16i32_arg( +; HSA-NEXT: [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(132) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 64 +; HSA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to <16 x i32> addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 128 +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN:%.*]] = load <16 x i32>, <16 x i32> addrspace(4)* [[TMP2]], align 64 +; HSA-NEXT: [[CAST_OUT:%.*]] = bitcast i32 addrspace(1)* [[OUT_LOAD]] to <16 x i32> addrspace(1)* +; HSA-NEXT: store volatile <16 x i32> [[IN]], <16 x i32> addrspace(1)* [[CAST_OUT]], align 4 +; HSA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_natural_align_constant_v16i32_arg( +; MESA-NEXT: [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(168) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 100 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to <16 x i32> addrspace(4)* +; MESA-NEXT: 
[[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 164 +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN:%.*]] = load <16 x i32>, <16 x i32> addrspace(4)* [[TMP2]], align 64 +; MESA-NEXT: [[CAST_OUT:%.*]] = bitcast i32 addrspace(1)* [[OUT_LOAD]] to <16 x i32> addrspace(1)* +; MESA-NEXT: store volatile <16 x i32> [[IN]], <16 x i32> addrspace(1)* [[CAST_OUT]], align 4 +; MESA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load <16 x i32>, <16 x i32> addrspace(4)* %in.byref + %cast.out = bitcast i32 addrspace(1)* %out to <16 x i32> addrspace(1)* + store volatile <16 x i32> %in, <16 x i32> addrspace(1)* %cast.out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +; Also accept byref kernel arguments with other global address spaces. +define amdgpu_kernel void @byref_global_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* byref(i32) %in.byref) { +; HSA-LABEL: @byref_global_i32_arg( +; HSA-NEXT: [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(1)* +; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(1)* [[TMP1]], align 4 +; HSA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_global_i32_arg( +; MESA-NEXT: [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(1)* +; MESA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(1)* [[TMP1]], align 4 +; MESA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32 addrspace(1)* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_flat_i32_arg(i32 
addrspace(1)* nocapture %out, i32* byref(i32) %in.byref) { +; HSA-LABEL: @byref_flat_i32_arg( +; HSA-NEXT: [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32* +; HSA-NEXT: [[IN:%.*]] = load i32, i32* [[TMP1]], align 4 +; HSA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_flat_i32_arg( +; MESA-NEXT: [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32* +; MESA-NEXT: [[IN:%.*]] = load i32, i32* [[TMP1]], align 4 +; MESA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_constant_32bit_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(6)* byref(i32) %in.byref) { +; HSA-LABEL: @byref_constant_32bit_i32_arg( +; HSA-NEXT: [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(6)* +; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(6)* [[TMP1]], align 4 +; HSA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_constant_32bit_i32_arg( +; MESA-NEXT: [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* 
@llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(6)* +; MESA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(6)* [[TMP1]], align 4 +; MESA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32 addrspace(6)* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_unknown_as_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(999)* byref(i32) %in.byref) { +; HSA-LABEL: @byref_unknown_as_i32_arg( +; HSA-NEXT: [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(999)* +; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(999)* [[TMP1]], align 4 +; HSA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_unknown_as_i32_arg( +; MESA-NEXT: [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(999)* +; MESA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(999)* [[TMP1]], align 4 +; MESA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32 addrspace(999)* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +; Invalid, but should not crash. 
+define amdgpu_kernel void @byref_private_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(5)* byref(i32) %in.byref) { +; HSA-LABEL: @byref_private_i32_arg( +; HSA-NEXT: [[BYREF_PRIVATE_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_PRIVATE_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_PRIVATE_I32_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(5)* +; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(5)* [[TMP1]], align 4 +; HSA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_private_i32_arg( +; MESA-NEXT: [[BYREF_PRIVATE_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_PRIVATE_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_PRIVATE_I32_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(5)* +; MESA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(5)* [[TMP1]], align 4 +; MESA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32 addrspace(5)* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +; Invalid, but should not crash. 
+define amdgpu_kernel void @byref_local_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(3)* byref(i32) %in.byref) { +; HSA-LABEL: @byref_local_i32_arg( +; HSA-NEXT: [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(3)* +; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(3)* [[TMP1]], align 4 +; HSA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_local_i32_arg( +; MESA-NEXT: [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(3)* +; MESA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(3)* [[TMP1]], align 4 +; MESA-NEXT: store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32 addrspace(3)* %in.byref + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @multi_byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) %in0.byref, i32 addrspace(4)* byref(i32) %in1.byref, i32 %after.offset) { +; HSA-LABEL: @multi_byref_constant_i32_arg( +; HSA-NEXT: [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; HSA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN0_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 8 +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN0_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[IN1_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 12 +; HSA-NEXT: 
[[TMP2:%.*]] = bitcast i8 addrspace(4)* [[IN1_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 16 +; HSA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[IN0:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4 +; HSA-NEXT: [[IN1:%.*]] = load i32, i32 addrspace(4)* [[TMP2]], align 4 +; HSA-NEXT: store volatile i32 [[IN0]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: store volatile i32 [[IN1]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @multi_byref_constant_i32_arg( +; MESA-NEXT: [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(56) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)* +; MESA-NEXT: [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN0_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 44 +; MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN0_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[IN1_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 48 +; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[IN1_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 52 +; MESA-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[IN0:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4 +; MESA-NEXT: [[IN1:%.*]] = load i32, i32 addrspace(4)* [[TMP2]], align 4 +; MESA-NEXT: store volatile i32 [[IN0]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: store volatile i32 [[IN1]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4 +; MESA-NEXT: ret void +; + %in0 = load i32, i32 addrspace(4)* %in0.byref + %in1 = load i32, i32 addrspace(4)* %in1.byref + store volatile i32 %in0, i32 addrspace(1)* %out, align 4 + store volatile i32 %in1, i32 addrspace(1)* %out, align 4 + store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @byref_constant_i32_arg_offset0(i32 addrspace(4)* byref(i32) %in.byref) { +; HSA-LABEL: @byref_constant_i32_arg_offset0( +; HSA-NEXT: [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* 
@llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; HSA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4 +; HSA-NEXT: store i32 [[IN]], i32 addrspace(1)* undef, align 4 +; HSA-NEXT: ret void +; +; MESA-LABEL: @byref_constant_i32_arg_offset0( +; MESA-NEXT: [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)* +; MESA-NEXT: [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4 +; MESA-NEXT: store i32 [[IN]], i32 addrspace(1)* undef, align 4 +; MESA-NEXT: ret void +; + %in = load i32, i32 addrspace(4)* %in.byref + store i32 %in, i32 addrspace(1)* undef, align 4 + ret void +} + attributes #0 = { nounwind "target-cpu"="kaveri" } attributes #1 = { nounwind "target-cpu"="kaveri" "amdgpu-implicitarg-num-bytes"="40" } attributes #2 = { nounwind "target-cpu"="tahiti" }