diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -904,7 +904,7 @@
             *this, TT, *HostTC, C.getInputArgs());
       else if (TT.isAMDGCN())
         DeviceTC = std::make_unique<toolchains::AMDGPUOpenMPToolChain>(
-            *this, TT, *HostTC, C.getInputArgs());
+            *this, TT, *HostTC, C.getInputArgs(), C.getArgs());
       else
         assert(DeviceTC && "Device toolchain not defined.");
     }
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -950,6 +950,7 @@
   StringRef LibDeviceFile = RocmInstallation.getLibDeviceFile(CanonArch);
   auto ABIVer = DeviceLibABIVersion::fromCodeObjectVersion(
       getAMDGPUCodeObjectVersion(getDriver(), DriverArgs));
+
   if (!RocmInstallation.checkCommonBitcodeLibs(CanonArch, LibDeviceFile,
                                                ABIVer))
     return {};
diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h
--- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h
+++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h
@@ -26,8 +26,8 @@
     : public ROCMToolChain {
 public:
   AMDGPUOpenMPToolChain(const Driver &D, const llvm::Triple &Triple,
-                        const ToolChain &HostTC,
-                        const llvm::opt::ArgList &Args);
+                        const ToolChain &HostTC, const llvm::opt::ArgList &Args,
+                        const llvm::opt::DerivedArgList &DerivedArgs);
 
   const llvm::Triple *getAuxTriple() const override {
     return &HostTC.getTriple();
@@ -58,6 +58,7 @@
   getDeviceLibs(const llvm::opt::ArgList &Args) const override;
 
   const ToolChain &HostTC;
+  const llvm::opt::DerivedArgList &DerivedArgs;
 };
 
 } // end namespace toolchains
diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
--- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp
@@ -47,8 +47,9 @@
 AMDGPUOpenMPToolChain::AMDGPUOpenMPToolChain(const Driver &D,
                                              const llvm::Triple &Triple,
                                              const ToolChain &HostTC,
-                                             const ArgList &Args)
-    : ROCMToolChain(D, Triple, Args), HostTC(HostTC) {
+                                             const ArgList &Args,
+                                             const DerivedArgList &DerivedArgs)
+    : ROCMToolChain(D, Triple, Args), HostTC(HostTC), DerivedArgs(DerivedArgs) {
   // Lookup binaries into the driver directory, this is used to
   // discover the clang-offload-bundler executable.
   getProgramPaths().push_back(getDriver().Dir);
@@ -179,7 +180,7 @@
       getTriple(), Args.getLastArgValue(options::OPT_march_EQ));
 
   SmallVector<std::string> BCLibs;
-  for (auto BCLib : getCommonDeviceLibNames(Args, GpuArch.str(),
+  for (auto BCLib : getCommonDeviceLibNames(DerivedArgs, GpuArch.str(),
                                             /*IsOpenMP=*/true))
     BCLibs.emplace_back(BCLib);
 
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7082,7 +7082,8 @@
   }
 
   if (Triple.isAMDGPU()) {
-    handleAMDGPUCodeObjectVersionOptions(D, Args, CmdArgs);
+    handleAMDGPUCodeObjectVersionOptions(D, C.getArgs(), CmdArgs,
+                                         /*IsCC1As=*/true);
 
     Args.addOptInFlag(CmdArgs, options::OPT_munsafe_fp_atomics,
                       options::OPT_mno_unsafe_fp_atomics);
@@ -8048,7 +8049,8 @@
   }
 
   if (Triple.isAMDGPU())
-    handleAMDGPUCodeObjectVersionOptions(D, Args, CmdArgs, /*IsCC1As=*/true);
+    handleAMDGPUCodeObjectVersionOptions(D, C.getArgs(), CmdArgs,
+                                         /*IsCC1As=*/true);
 
   assert(Input.isFilename() && "Invalid input.");
   CmdArgs.push_back(Input.getFilename());
diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h
--- a/openmp/libomptarget/DeviceRTL/include/Interface.h
+++ b/openmp/libomptarget/DeviceRTL/include/Interface.h
@@ -165,6 +165,11 @@
 
 double omp_get_wtime(void);
 ///}
+
+#ifdef __AMDGCN__
+size_t external_get_local_size(uint32_t dim);
+size_t external_get_num_groups(uint32_t dim);
+#endif
 }
 
 extern "C" {
diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
--- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -16,7 +16,7 @@
 #include "Utils.h"
 
 #pragma omp begin declare target device_type(nohost)
-
+extern const uint16_t __oclc_ABI_version;
 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
 
 using namespace _OMP;
@@ -47,9 +47,7 @@
   return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
 }
 
-uint32_t getNumHardwareThreadsInBlock() {
-  return __builtin_amdgcn_workgroup_size_x();
-}
+uint32_t getNumHardwareThreadsInBlock() { return external_get_local_size(0); }
 
 LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }
 
@@ -79,9 +77,7 @@
 
 uint32_t getBlockId() { return __builtin_amdgcn_workgroup_id_x(); }
 
-uint32_t getNumberOfBlocks() {
-  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
-}
+uint32_t getNumberOfBlocks() { return external_get_num_groups(0); }
 
 uint32_t getWarpId() {
   return impl::getThreadIdInBlock() / mapping::getWarpSize();
diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp
--- a/openmp/libomptarget/DeviceRTL/src/State.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -54,6 +54,9 @@
 ///{
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
+extern "C" size_t __ockl_get_local_size(uint32_t dim);
+extern "C" size_t __ockl_get_num_groups(uint32_t dim);
+
 extern "C" {
 void *malloc(uint64_t Size) {
   // TODO: Use some preallocated space for dynamic malloc.
@@ -66,6 +69,17 @@
 #pragma omp end declare variant
 ///}
 
+extern "C" {
+#ifdef __AMDGCN__
+size_t external_get_local_size(uint32_t dim) {
+  return __ockl_get_local_size(dim);
+}
+size_t external_get_num_groups(uint32_t dim) {
+  return __ockl_get_num_groups(dim);
+}
+#endif
+} // extern "C"
+
 /// A "smart" stack in shared memory.
 ///
 /// The stack exposes a malloc/free interface but works like a stack internally.
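
Note: The DeviceRTL hunks above stop reading the workgroup size and grid size from
__builtin_amdgcn_* intrinsics and instead route through external_get_local_size /
external_get_num_groups, which forward to the ROCm device library's
__ockl_get_local_size / __ockl_get_num_groups. Those helpers can key off
__oclc_ABI_version (hence the new extern in Mapping.cpp) and, under code object
v5, read the launch geometry from the implicit kernarg block rather than the
dispatch packet. A minimal sketch of that dispatch pattern, using hypothetical
names (abi_version, implicit_args, dispatch_packet_workgroup_size) and the COV5
group-size offsets from the IMPLICITARGS table later in this patch; this is an
illustration, not the device-library source:

    #include <cstdint>
    #include <cstring>

    extern const uint16_t abi_version;   // stand-in for __oclc_ABI_version
    extern const uint8_t *implicit_args; // per-dispatch implicit kernarg block
    uint16_t dispatch_packet_workgroup_size(uint32_t Dim); // pre-COV5 source

    // COV5 stores the X/Y/Z group sizes as uint16_t at offsets 12/14/16.
    size_t get_local_size(uint32_t Dim) {
      if (abi_version < 500) // 500 <=> code object v5
        return dispatch_packet_workgroup_size(Dim);
      uint16_t Size;
      std::memcpy(&Size, implicit_args + 12 + 2 * Dim, sizeof(Size));
      return Size;
    }
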
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h
--- a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h
@@ -12,4 +12,49 @@
 
 const char *get_elf_mach_gfx_name(uint32_t EFlags);
 
+enum IMPLICITARGS : uint16_t {
+  COV4_SIZE = 56,
+  COV4_HOSTCALL_PTR_OFFSET = 24,
+  HOSTCALL_PTR_SIZE = 8,
+
+  COV5_SIZE = 256,
+
+  COV5_BLOCK_COUNT_X_OFFSET = 0,
+  COV5_BLOCK_COUNT_X_SIZE = 4,
+
+  COV5_BLOCK_COUNT_Y_OFFSET = 4,
+  COV5_BLOCK_COUNT_Y_SIZE = 4,
+
+  COV5_BLOCK_COUNT_Z_OFFSET = 8,
+  COV5_BLOCK_COUNT_Z_SIZE = 4,
+
+  COV5_GROUP_SIZE_X_OFFSET = 12,
+  COV5_GROUP_SIZE_X_SIZE = 2,
+
+  COV5_GROUP_SIZE_Y_OFFSET = 14,
+  COV5_GROUP_SIZE_Y_SIZE = 2,
+
+  COV5_GROUP_SIZE_Z_OFFSET = 16,
+  COV5_GROUP_SIZE_Z_SIZE = 2,
+
+  COV5_REMAINDER_X_OFFSET = 18,
+  COV5_REMAINDER_X_SIZE = 2,
+
+  COV5_REMAINDER_Y_OFFSET = 20,
+  COV5_REMAINDER_Y_SIZE = 2,
+
+  COV5_REMAINDER_Z_OFFSET = 22,
+  COV5_REMAINDER_Z_SIZE = 2,
+
+  COV5_GRID_DIMS_OFFSET = 64,
+  COV5_GRID_DIMS_SIZE = 2,
+
+  COV5_HOSTCALL_PTR_OFFSET = 80,
+
+  COV5_HEAPV1_PTR_OFFSET = 96,
+  COV5_HEAPV1_PTR_SIZE = 8
+};
+
+const uint16_t implicitArgsSize(uint16_t Version);
+
 #endif
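
Note: The IMPLICITARGS values replace the old impl_implicit_args_t struct
(deleted from internal.h below) with explicit offsets: COV4 keeps a 56-byte
block with the hostcall pointer at offset 24, while COV5 grows it to 256 bytes
with dedicated block-count, group-size, remainder, grid-dims, hostcall, and
heap slots. A few illustrative compile-time checks (not part of the patch;
assumes this header is included) make the intended packing visible, along with
the typical use of implicitArgsSize:

    #include "get_elf_mach_gfx_name.h"

    // Each COV5 offset is the previous offset plus the previous field's size.
    static_assert(IMPLICITARGS::COV5_BLOCK_COUNT_Y_OFFSET ==
                      IMPLICITARGS::COV5_BLOCK_COUNT_X_OFFSET +
                          IMPLICITARGS::COV5_BLOCK_COUNT_X_SIZE, "");
    static_assert(IMPLICITARGS::COV5_GROUP_SIZE_X_OFFSET ==
                      IMPLICITARGS::COV5_BLOCK_COUNT_Z_OFFSET +
                          IMPLICITARGS::COV5_BLOCK_COUNT_Z_SIZE, "");
    static_assert(IMPLICITARGS::COV5_REMAINDER_X_OFFSET ==
                      IMPLICITARGS::COV5_GROUP_SIZE_Z_OFFSET +
                          IMPLICITARGS::COV5_GROUP_SIZE_Z_SIZE, "");

    // Hypothetical helper mirroring kernargSizeIncludingImplicit() in rtl.cpp:
    // the kernarg footprint is the explicit bytes plus 56 (COV4) or 256 (COV5).
    inline uint32_t kernargFootprint(uint32_t ExplicitBytes, uint16_t Version) {
      return ExplicitBytes + implicitArgsSize(Version);
    }
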
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
--- a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
@@ -11,6 +11,7 @@
 // identifier) and contains more up to date values for the enum checked here.
 // rtl.cpp uses the system elf.h.
 #include "llvm/BinaryFormat/ELF.h"
+using namespace llvm::ELF;
 
 const char *get_elf_mach_gfx_name(uint32_t EFlags) {
   using namespace llvm::ELF;
@@ -78,3 +79,8 @@
     return "--unknown gfx";
   }
 }
+
+const uint16_t implicitArgsSize(uint16_t Version) {
+  return Version < ELFABIVERSION_AMDGPU_HSA_V5 ? IMPLICITARGS::COV4_SIZE
+                                               : IMPLICITARGS::COV5_SIZE;
+}
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h
--- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h
@@ -33,17 +33,6 @@
 
 #define MAX_NUM_KERNELS (1024 * 16)
 
-typedef struct impl_implicit_args_s {
-  uint64_t offset_x;
-  uint64_t offset_y;
-  uint64_t offset_z;
-  uint64_t hostcall_ptr;
-  uint64_t unused0;
-  uint64_t unused1;
-  uint64_t unused2;
-} impl_implicit_args_t;
-static_assert(sizeof(impl_implicit_args_t) == 56, "");
-
 // ---------------------- Kernel Start -------------
 typedef struct atl_kernel_info_s {
   uint64_t kernel_object;
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
--- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
@@ -67,6 +67,17 @@
   HiddenMultiGridSyncArg,
   HiddenHostcallBuffer,
   HiddenHeapV1,
+  HiddenBlockCountX,
+  HiddenBlockCountY,
+  HiddenBlockCountZ,
+  HiddenGroupSizeX,
+  HiddenGroupSizeY,
+  HiddenGroupSizeZ,
+  HiddenRemainderX,
+  HiddenRemainderY,
+  HiddenRemainderZ,
+  HiddenGridDims,
+  HiddenQueuePtr,
   Unknown
 };
@@ -102,7 +113,19 @@
     {"hidden_multigrid_sync_arg",
      KernelArgMD::ValueKind::HiddenMultiGridSyncArg},
     {"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
-    {"hidden_heap_v1", KernelArgMD::ValueKind::HiddenHeapV1}};
+    {"hidden_heap_v1", KernelArgMD::ValueKind::HiddenHeapV1},
+    {"hidden_block_count_x", KernelArgMD::ValueKind::HiddenBlockCountX},
+    {"hidden_block_count_y", KernelArgMD::ValueKind::HiddenBlockCountY},
+    {"hidden_block_count_z", KernelArgMD::ValueKind::HiddenBlockCountZ},
+    {"hidden_group_size_x", KernelArgMD::ValueKind::HiddenGroupSizeX},
+    {"hidden_group_size_y", KernelArgMD::ValueKind::HiddenGroupSizeY},
+    {"hidden_group_size_z", KernelArgMD::ValueKind::HiddenGroupSizeZ},
+    {"hidden_remainder_x", KernelArgMD::ValueKind::HiddenRemainderX},
+    {"hidden_remainder_y", KernelArgMD::ValueKind::HiddenRemainderY},
+    {"hidden_remainder_z", KernelArgMD::ValueKind::HiddenRemainderZ},
+    {"hidden_grid_dims", KernelArgMD::ValueKind::HiddenGridDims},
+    {"hidden_queue_ptr", KernelArgMD::ValueKind::HiddenQueuePtr},
+};
 
 namespace core {
@@ -164,6 +187,17 @@
   case KernelArgMD::ValueKind::HiddenMultiGridSyncArg:
   case KernelArgMD::ValueKind::HiddenHostcallBuffer:
   case KernelArgMD::ValueKind::HiddenHeapV1:
+  case KernelArgMD::ValueKind::HiddenBlockCountX:
+  case KernelArgMD::ValueKind::HiddenBlockCountY:
+  case KernelArgMD::ValueKind::HiddenBlockCountZ:
+  case KernelArgMD::ValueKind::HiddenGroupSizeX:
+  case KernelArgMD::ValueKind::HiddenGroupSizeY:
+  case KernelArgMD::ValueKind::HiddenGroupSizeZ:
+  case KernelArgMD::ValueKind::HiddenRemainderX:
+  case KernelArgMD::ValueKind::HiddenRemainderY:
+  case KernelArgMD::ValueKind::HiddenRemainderZ:
+  case KernelArgMD::ValueKind::HiddenGridDims:
+  case KernelArgMD::ValueKind::HiddenQueuePtr:
     return true;
   default:
     return false;
@@ -473,8 +507,6 @@
       size_t new_offset = lcArg.offset_;
       size_t padding = new_offset - offset;
       offset = new_offset;
-      DP("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(), lcArg.size_,
-         lcArg.offset_);
       offset += lcArg.size_;
 
       // check if the arg is a hidden/implicit arg
@@ -482,9 +514,13 @@
       if (!isImplicit(lcArg.valueKind_)) {
         info.explicit_argument_count++;
         kernel_explicit_args_size += lcArg.size_;
+        DP("Explicit Kernel Arg[%lu] \"%s\" (%u, %u)\n", i,
+           lcArg.name_.c_str(), lcArg.size_, lcArg.offset_);
       } else {
         info.implicit_argument_count++;
         hasHiddenArgs = true;
+        DP("Implicit Kernel Arg[%lu] \"%s\" (%u, %u)\n", i,
+           lcArg.name_.c_str(), lcArg.size_, lcArg.offset_);
       }
       kernel_explicit_args_size += padding;
     }
@@ -492,7 +528,7 @@
 
   // TODO: Probably don't want this arithmetic
   info.kernel_segment_size =
-      (hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size);
+      (!hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size);
   DP("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(),
      kernel_segment_size, info.kernel_segment_size);
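
Note: With the hidden kernarg kinds enumerated, system.cpp now counts explicit
and implicit arguments separately, and the flipped condition in the
kernel_segment_size assignment reads: if the metadata listed hidden args, trust
the ELF-reported segment size (which already covers them); otherwise fall back
to the explicit bytes accumulated during the walk. Reduced to a sketch
(illustrative only; the helper name is hypothetical):

    #include <cstdint>

    // Mirrors: info.kernel_segment_size =
    //     (!hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size);
    uint32_t selectKernargSegmentSize(bool HasHiddenArgs,
                                      uint32_t ExplicitArgBytes,
                                      uint32_t ElfReportedBytes) {
      return HasHiddenArgs ? ElfReportedBytes : ExplicitArgBytes;
    }
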
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -124,9 +124,10 @@
   uint32_t KernargSegmentSize;
   void *KernargRegion = nullptr;
   std::queue<int> FreeKernargSegments;
+  uint16_t CodeObjectVersion;
 
   uint32_t kernargSizeIncludingImplicit() {
-    return KernargSegmentSize + sizeof(impl_implicit_args_t);
+    return KernargSegmentSize + implicitArgsSize(CodeObjectVersion);
   }
 
   ~KernelArgPool() {
@@ -143,8 +144,10 @@
 
   KernelArgPool(const KernelArgPool &) = delete;
   KernelArgPool(KernelArgPool &&) = delete;
 
-  KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
-      : KernargSegmentSize(KernargSegmentSize) {
+  KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool,
+                uint16_t CodeObjectVersion)
+      : KernargSegmentSize(KernargSegmentSize),
+        CodeObjectVersion(CodeObjectVersion) {
     // impl uses one pool per kernel for all gpus, with a fixed upper size
     // preserving that exact scheme here, including the queue
@@ -228,16 +231,16 @@
   KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
            int32_t DeviceId, void *CallStackAddr, const char *Name,
            uint32_t KernargSegmentSize,
-           hsa_amd_memory_pool_t &KernArgMemoryPool)
+           hsa_amd_memory_pool_t &KernArgMemoryPool, uint16_t CodeObjectVersion)
       : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
         DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
     DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
 
     std::string N(Name);
     if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
-      KernelArgPoolMap.insert(
-          std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
-                                KernargSegmentSize, KernArgMemoryPool))));
+      KernelArgPoolMap.insert(std::make_pair(
+          N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
+                 KernargSegmentSize, KernArgMemoryPool, CodeObjectVersion))));
     }
   }
 };
@@ -474,6 +477,7 @@
   std::vector<int> WarpSize;
   std::vector<std::string> GPUName;
   std::vector<std::string> TargetID;
+  uint16_t CodeObjectVersion;
 
   // OpenMP properties
   std::vector<int> NumTeams;
@@ -487,6 +491,7 @@
 
   // Resource pools
   SignalPoolT FreeSignalPool;
+  std::vector<void *> PreallocatedDeviceHeap;
 
   bool HostcallRequired = false;
 
@@ -861,7 +866,6 @@
            "Unexpected device id!");
   FuncGblEntries[DeviceId].emplace_back();
   FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-  // KernelArgPoolMap.clear();
   E.Entries.clear();
   E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
 }
@@ -1032,6 +1036,7 @@
   SymbolInfoTable.resize(NumberOfDevices);
   DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
   DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
+  PreallocatedDeviceHeap.resize(NumberOfDevices);
 
   Err = setupDevicePools(HSAAgents);
   if (Err != HSA_STATUS_SUCCESS) {
@@ -1361,6 +1366,27 @@
   return PacketId;
 }
 
+const uint16_t getCodeObjectVersionFromELF(__tgt_device_image *Image) {
+  char *ImageBegin = (char *)Image->ImageStart;
+  size_t ImageSize = (char *)Image->ImageEnd - ImageBegin;
+
+  StringRef Buffer = StringRef(ImageBegin, ImageSize);
+  auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
+                                                  /*InitContent=*/false);
+  if (!ElfOrErr) {
+    REPORT("Failed to load ELF: %s\n", toString(ElfOrErr.takeError()).c_str());
+    return 1;
+  }
+
+  if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get())) {
+    auto Header = ELFObj->getELFFile().getHeader();
+    uint16_t Version = (uint8_t)(Header.e_ident[EI_ABIVERSION]);
+    DP("ELFABIVERSION Version: %u\n", Version);
+    return Version;
+  }
+  return 0;
+}
+
 int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
                         ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams,
                         int32_t ThreadLimit, uint64_t LoopTripcount) {
@@ -1401,6 +1427,8 @@
   const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count;
 
   assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count);
+  int NumGroups = 0;
+  uint16_t ThreadsPerGroup = 0;
 
   /*
    * Set limit based on ThreadsPerGroup and GroupsPerDevice
@@ -1416,7 +1444,7 @@
   const int WorkgroupSize = LV.WorkgroupSize;
 
   if (print_kernel_trace >= LAUNCH) {
-    int NumGroups = GridSize / WorkgroupSize;
+    NumGroups = GridSize / WorkgroupSize;
     // enum modes are SPMD, GENERIC, NONE 0,1,2
     // if doing rtl timing, print to stderr, unless stdout requested.
     bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING);
@@ -1438,6 +1466,7 @@
   }
 
   uint64_t PacketId = acquireAvailablePacketId(Queue);
+  uint16_t CodeObjectVersion = DeviceInfo().CodeObjectVersion;
 
   const uint32_t Mask = Queue->size - 1; // size is a power of 2
   hsa_kernel_dispatch_packet_t *Packet =
       (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask);
@@ -1485,14 +1514,11 @@
         memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *));
       }
 
-      // Initialize implicit arguments. TODO: Which of these can be dropped
-      impl_implicit_args_t *ImplArgs = reinterpret_cast<impl_implicit_args_t *>(
-          static_cast<char *>(KernArg) + ArgPool->KernargSegmentSize);
-      memset(ImplArgs, 0,
-             sizeof(impl_implicit_args_t)); // may not be necessary
-      ImplArgs->offset_x = 0;
-      ImplArgs->offset_y = 0;
-      ImplArgs->offset_z = 0;
+      uint8_t *ImplArgs =
+          static_cast<uint8_t *>(KernArg) + sizeof(void *) * ArgNum;
+      memset(ImplArgs, 0, implicitArgsSize(CodeObjectVersion));
+
+      uint64_t Buffer = 0;
 
       // assign a hostcall buffer for the selected Q
       if (__atomic_load_n(&DeviceInfo().HostcallRequired, __ATOMIC_ACQUIRE)) {
@@ -1500,35 +1526,58 @@
         // under a multiple reader lock, not a writer lock.
         static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER;
         pthread_mutex_lock(&HostcallInitLock);
-        uint64_t Buffer = hostrpc_assign_buffer(
-            DeviceInfo().HSAAgents[DeviceId], Queue, DeviceId);
+        Buffer = hostrpc_assign_buffer(DeviceInfo().HSAAgents[DeviceId], Queue,
+                                       DeviceId);
         pthread_mutex_unlock(&HostcallInitLock);
         if (!Buffer) {
          DP("hostrpc_assign_buffer failed, gpu would dereference null and "
             "error\n");
          return OFFLOAD_FAIL;
        }
+      }
 
-        DP("Implicit argument count: %d\n",
-           KernelInfoEntry.implicit_argument_count);
-        if (KernelInfoEntry.implicit_argument_count >= 4) {
-          // Initialise pointer for implicit_argument_count != 0 ABI
-          // Guess that the right implicit argument is at offset 24 after
-          // the explicit arguments. In the future, should be able to read
-          // the offset from msgpack. Clang is not annotating it at present.
-          uint64_t Offset =
-              sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3);
-          if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) {
-            DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit "
-               "args: %d\n",
-               Offset + 8, ArgPool->kernargSizeIncludingImplicit());
-          } else {
-            memcpy(static_cast<char *>(KernArg) + Offset, &Buffer, 8);
-          }
-        }
+      DP("Implicit argument count: %d\n",
+         KernelInfoEntry.implicit_argument_count);
 
-        // initialise pointer for implicit_argument_count == 0 ABI
-        ImplArgs->hostcall_ptr = Buffer;
+      if (CodeObjectVersion < llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V5) {
+        DP("Setting Hostcall buffer for COV4\n");
+        memcpy(&ImplArgs[IMPLICITARGS::COV4_HOSTCALL_PTR_OFFSET], &Buffer,
+               IMPLICITARGS::HOSTCALL_PTR_SIZE);
+      } else {
+        DP("Setting fields of ImplicitArgs for COV5\n");
+        uint16_t Remainder = 0;
+        uint16_t GridDims = 1;
+        uint32_t NumGroupsYZ = 1;
+        uint16_t ThreadsPerGroupYZ = 0;
+        memcpy(&ImplArgs[IMPLICITARGS::COV5_BLOCK_COUNT_X_OFFSET], &NumGroups,
+               IMPLICITARGS::COV5_BLOCK_COUNT_X_SIZE);
+        memcpy(&ImplArgs[IMPLICITARGS::COV5_BLOCK_COUNT_Y_OFFSET], &NumGroupsYZ,
+               IMPLICITARGS::COV5_BLOCK_COUNT_Y_SIZE);
+        memcpy(&ImplArgs[IMPLICITARGS::COV5_BLOCK_COUNT_Z_OFFSET], &NumGroupsYZ,
+               IMPLICITARGS::COV5_BLOCK_COUNT_Z_SIZE);
+
+        memcpy(&ImplArgs[IMPLICITARGS::COV5_GROUP_SIZE_X_OFFSET],
+               &ThreadsPerGroup, IMPLICITARGS::COV5_GROUP_SIZE_X_SIZE);
+        memcpy(&ImplArgs[IMPLICITARGS::COV5_GROUP_SIZE_Y_OFFSET],
+               &ThreadsPerGroupYZ, IMPLICITARGS::COV5_GROUP_SIZE_Y_SIZE);
+        memcpy(&ImplArgs[IMPLICITARGS::COV5_GROUP_SIZE_Z_OFFSET],
+               &ThreadsPerGroupYZ, IMPLICITARGS::COV5_GROUP_SIZE_Z_SIZE);
+
+        memcpy(&ImplArgs[IMPLICITARGS::COV5_REMAINDER_X_OFFSET], &Remainder,
+               IMPLICITARGS::COV5_REMAINDER_X_SIZE);
+        memcpy(&ImplArgs[IMPLICITARGS::COV5_REMAINDER_Y_OFFSET], &Remainder,
+               IMPLICITARGS::COV5_REMAINDER_Y_SIZE);
+        memcpy(&ImplArgs[IMPLICITARGS::COV5_REMAINDER_Z_OFFSET], &Remainder,
+               IMPLICITARGS::COV5_REMAINDER_Z_SIZE);
+
+        memcpy(&ImplArgs[IMPLICITARGS::COV5_GRID_DIMS_OFFSET], &GridDims,
+               IMPLICITARGS::COV5_GRID_DIMS_SIZE);
+
+        memcpy(&ImplArgs[IMPLICITARGS::COV5_HOSTCALL_PTR_OFFSET], &Buffer,
+               IMPLICITARGS::HOSTCALL_PTR_SIZE);
+        memcpy(&ImplArgs[IMPLICITARGS::COV5_HEAPV1_PTR_OFFSET],
+               &(DeviceInfo().PreallocatedDeviceHeap[DeviceId]),
+               IMPLICITARGS::COV5_HEAPV1_PTR_SIZE);
       }
 
       Packet->kernarg_address = KernArg;
@@ -2149,6 +2198,40 @@
   return OFFLOAD_SUCCESS;
 }
 
+static void preAllocateHeapMemoryForCov5() {
+  void *DevPtr;
+  for (int I = 0; I < DeviceInfo().NumberOfDevices; I++) {
+    DevPtr = nullptr;
+    size_t PreAllocSize = 131072; // 128KB per device
+
+    hsa_amd_memory_pool_t MemoryPool =
+        DeviceInfo().DeviceCoarseGrainedMemoryPools[I];
+    hsa_status_t Err =
+        hsa_amd_memory_pool_allocate(MemoryPool, PreAllocSize, 0, &DevPtr);
+    if (Err != HSA_STATUS_SUCCESS) {
+      DP("Error allocating preallocated heap device memory: %s\n",
+         get_error_string(Err));
+    }
+
+    Err = hsa_amd_agents_allow_access(1, &DeviceInfo().HSAAgents[I], NULL,
+                                      DevPtr);
+    if (Err != HSA_STATUS_SUCCESS) {
+      DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
+         get_error_string(Err));
+    }
+
+    uint64_t Rounded =
+        sizeof(uint32_t) * ((PreAllocSize + 3) / sizeof(uint32_t));
+    Err = hsa_amd_memory_fill(DevPtr, 0, Rounded / sizeof(uint32_t));
+    if (Err != HSA_STATUS_SUCCESS) {
+      DP("Error zero-initializing preallocated heap device memory:%s\n",
+         get_error_string(Err));
+    }
+
+    DeviceInfo().PreallocatedDeviceHeap[I] = DevPtr;
+  }
+}
+
 static __tgt_target_table *
 __tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image);
@@ -2194,6 +2277,12 @@
   if (!elfMachineIdIsAmdgcn(Image))
     return NULL;
 
+  DeviceInfo().CodeObjectVersion = getCodeObjectVersionFromELF(Image);
+  if (DeviceInfo().CodeObjectVersion >=
+      llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V5) {
+    preAllocateHeapMemoryForCov5();
+  }
+
   {
     auto Env =
         DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices,
@@ -2517,7 +2606,8 @@
     KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
                                    CallStackAddr, E->name, KernargSegmentSize,
-                                   DeviceInfo().KernArgPool));
+                                   DeviceInfo().KernArgPool,
+                                   DeviceInfo().CodeObjectVersion));
 
     __tgt_offload_entry Entry = *E;
     Entry.addr = (void *)&KernelsList.back();
     DeviceInfo().addOffloadEntry(DeviceId, Entry);
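
Note: Taken together, the rtl.cpp changes (a) read the code object version once
per image from e_ident[EI_ABIVERSION], (b) preallocate a 128KB per-device heap
when the image is COV5, and (c) fill the implicit kernarg block by offset
instead of through the deleted struct. A condensed, self-contained sketch of
the dispatch-time packing for a 1-D launch; the offsets are copied from
IMPLICITARGS, but the helper itself is hypothetical, not the plugin's code:

    #include <cstdint>
    #include <cstring>

    // Offsets/sizes per the IMPLICITARGS enum in get_elf_mach_gfx_name.h.
    enum : uint16_t {
      BlockCountX = 0,  // uint32_t
      BlockCountY = 4,  // uint32_t
      BlockCountZ = 8,  // uint32_t
      GroupSizeX = 12,  // uint16_t
      GridDims = 64,    // uint16_t
      HostcallPtr = 80, // uint64_t
      HeapV1Ptr = 96,   // uint64_t
      Cov5Bytes = 256
    };

    // Zero the block, then store grid geometry, dimensionality, hostcall
    // buffer, and preallocated heap pointer, as runRegionLocked now does.
    void packCov5(uint8_t *ImplArgs, uint32_t NumGroups,
                  uint16_t ThreadsPerGroup, uint64_t HostcallBuffer,
                  void *DeviceHeap) {
      std::memset(ImplArgs, 0, Cov5Bytes);
      std::memcpy(ImplArgs + BlockCountX, &NumGroups, sizeof(NumGroups));
      uint32_t One = 1; // the patch stores 1 for the Y/Z block counts
      std::memcpy(ImplArgs + BlockCountY, &One, sizeof(One));
      std::memcpy(ImplArgs + BlockCountZ, &One, sizeof(One));
      std::memcpy(ImplArgs + GroupSizeX, &ThreadsPerGroup,
                  sizeof(ThreadsPerGroup));
      uint16_t Dims = 1;
      std::memcpy(ImplArgs + GridDims, &Dims, sizeof(Dims));
      std::memcpy(ImplArgs + HostcallPtr, &HostcallBuffer,
                  sizeof(HostcallBuffer));
      std::memcpy(ImplArgs + HeapV1Ptr, &DeviceHeap, sizeof(DeviceHeap));
    }
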