Index: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp =================================================================== --- openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -1816,14 +1816,33 @@ return Plugin::success(); } + /// Pin the host buffer and return the device pointer that should be used for + /// device transfers. + Expected dataLockImpl(void *HstPtr, int64_t Size) override { + void *PinnedPtr = nullptr; + + hsa_status_t Status = + hsa_amd_memory_lock(HstPtr, Size, nullptr, 0, &PinnedPtr); + if (auto Err = Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n")) + return Err; + + return PinnedPtr; + } + + /// Unpin the host buffer. + Error dataUnlockImpl(void *HstPtr) override { + hsa_status_t Status = hsa_amd_memory_unlock(HstPtr); + return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n"); + } + /// Submit data to the device (host to device transfer). Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) override { - // Use one-step asynchronous operation when host memory is already pinned. - if (isHostPinnedMemoryBuffer(HstPtr)) { + if (void *PinnedPtr = + PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) { AMDGPUStreamTy &Stream = getStream(AsyncInfoWrapper); - return Stream.pushPinnedMemoryCopyAsync(TgtPtr, HstPtr, Size); + return Stream.pushPinnedMemoryCopyAsync(TgtPtr, PinnedPtr, Size); } void *PinnedHstPtr = nullptr; @@ -1877,10 +1896,10 @@ AsyncInfoWrapperTy &AsyncInfoWrapper) override { // Use one-step asynchronous operation when host memory is already pinned. - if (isHostPinnedMemoryBuffer(HstPtr)) { - // Use one-step asynchronous operation when host memory is already pinned. 
+ if (void *PinnedPtr = + PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) { AMDGPUStreamTy &Stream = getStream(AsyncInfoWrapper); - return Stream.pushPinnedMemoryCopyAsync(HstPtr, TgtPtr, Size); + return Stream.pushPinnedMemoryCopyAsync(PinnedPtr, TgtPtr, Size); } void *PinnedHstPtr = nullptr; Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h =================================================================== --- openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h +++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -245,6 +245,117 @@ uint32_t MaxNumThreads; }; +/// Class representing a map of host pinned allocations. We track these pinned +/// allocations, so memory transfers involving these buffers can be optimized. +class PinnedAllocationMapTy { + + /// Struct representing a map entry. + struct EntryTy { + /// The host pointer of the pinned allocation. + void *HstPtr; + + /// The pointer that devices' driver should use to transfer data from/to the + /// pinned allocation. In most plugins, this pointer will be the same as the + /// host pointer above. + void *DevAccessiblePtr; + + /// The size of the pinned allocation. + size_t Size; + + /// Create an entry with the host and device accessible pointers, and the + /// buffer size. + EntryTy(void *HstPtr, void *DevAccessiblePtr, size_t Size) + : HstPtr(HstPtr), DevAccessiblePtr(DevAccessiblePtr), Size(Size) {} + + /// Utility constructor used for std::set searches. + EntryTy(void *HstPtr) + : HstPtr(HstPtr), DevAccessiblePtr(nullptr), Size(0) {} + }; + + /// Comparator of map entries. Use the host pointer to enforce an order + /// between entries. + struct EntryCmpTy { + bool operator()(const EntryTy &Left, const EntryTy &Right) const { + return Left.HstPtr < Right.HstPtr; + } + }; + + /// The map of host pinned allocations. + std::set Allocs; + + /// The mutex to protect accesses to the map. 
+ mutable std::shared_mutex Mutex; + + /// Find an allocation that intersects with a specific buffer pointer. Assume + /// the map's mutex is acquired. + const EntryTy *findIntersecting(const void *Buffer) const { + if (Allocs.empty()) + return nullptr; + + // Search the first allocation with starting address that is not less than + // the buffer address. + auto It = Allocs.lower_bound({const_cast(Buffer)}); + + // Direct match of starting addresses. + if (It != Allocs.end() && It->HstPtr == Buffer) + return &(*It); + + // Not direct match but may be a previous pinned allocation in the map which + // contains the buffer. Return null if there is no such previous + // allocation. + if (It == Allocs.begin()) + return nullptr; + + // Move to the previous pinned allocation. + --It; + + // The buffer is contained in the previous pinned allocation. + if ((char *)It->HstPtr + It->Size > (const char *)Buffer) + return &(*It); + + // None found. + return nullptr; + } + +public: + /// Register a host buffer as host pinned memory. The registration requires + /// the host pointer in \p HstPtr, the pointer that the devices should use + /// when transferring data from/to the allocation in \p DevAccessiblePtr, and + /// the size of the allocation in \p Size. Notice that some plugins may use + /// the same pointer for the \p HstPtr and \p DevAccessiblePtr. + Error registerHostPinnedBuffer(void *HstPtr, void *DevAccessiblePtr, + size_t Size); + + /// Unregister a host pinned allocation passing the host pointer. + Error unregisterHostPinnedBuffer(void *HstPtr); + + /// Return the device accessible pointer associated with the host pinned + /// allocation to which \p HstPtr belongs, if any. Return null in case the + /// \p HstPtr does not belong to any host pinned allocation. The device + /// accessible pointer is the one that devices should use for data transfers + /// that involve a host pinned buffer. 
+ void *getDeviceAccessiblePtrFromPinnedBuffer(const void *HstPtr) const { + std::shared_lock Lock(Mutex); + + // Find the intersecting allocation if any. + const EntryTy *Entry = findIntersecting(HstPtr); + if (!Entry) + return nullptr; + + // TODO: Use helper function from D140719 patch. + ptrdiff_t Offset = ((const char *)HstPtr) - ((char *)Entry->HstPtr); + return ((char *)Entry->DevAccessiblePtr) + Offset; + } + + /// Check whether a buffer belongs to a registered host pinned allocation. + bool isHostPinnedBuffer(const void *HstPtr) const { + std::shared_lock Lock(Mutex); + + // Return whether there is an intersecting allocation. + return (findIntersecting(const_cast(HstPtr)) != nullptr); + } +}; + /// Class implementing common functionalities of offload devices. Each plugin /// should define the specific device class, derive from this generic one, and /// implement the necessary virtual function members. @@ -303,6 +414,16 @@ /// Deallocate data from the device or involving the device. Error dataDelete(void *TgtPtr, TargetAllocTy Kind); + /// Pin host memory to optimize transfers and return the device accessible + /// pointer that devices should use for memory transfers involving the host + /// pinned allocation. + Expected dataLock(void *HstPtr, int64_t Size); + virtual Expected dataLockImpl(void *HstPtr, int64_t Size) = 0; + + /// Unpin a host memory buffer that was previously pinned. + Error dataUnlock(void *HstPtr); + virtual Error dataUnlockImpl(void *HstPtr) = 0; + /// Submit data to the device (host to device transfer). Error dataSubmit(void *TgtPtr, const void *HstPtr, int64_t Size, __tgt_async_info *AsyncInfo); @@ -418,12 +539,6 @@ /// setupDeviceEnvironment() function. virtual bool shouldSetupDeviceEnvironment() const { return true; } - /// Register a host buffer as host pinned allocation. - Error registerHostPinnedMemoryBuffer(const void *Buffer, size_t Size); - - /// Unregister a host pinned allocations. 
- Error unregisterHostPinnedMemoryBuffer(const void *Buffer); - /// Pointer to the memory manager or nullptr if not available. MemoryManagerTy *MemoryManager; @@ -438,40 +553,7 @@ UInt64Envar OMPX_TargetStackSize; UInt64Envar OMPX_TargetHeapSize; - /// Map of host pinned allocations. We track these pinned allocations so that - /// memory transfers involving these allocations can be optimized. - std::map HostAllocations; - mutable std::shared_mutex HostAllocationsMutex; - protected: - /// Check whether a buffer has been registered as host pinned memory. - bool isHostPinnedMemoryBuffer(const void *Buffer) const { - std::shared_lock Lock(HostAllocationsMutex); - - if (HostAllocations.empty()) - return false; - - // Search the first allocation with starting address that is not less than - // the buffer address. - auto It = HostAllocations.lower_bound(Buffer); - - // Direct match of starting addresses. - if (It != HostAllocations.end() && It->first == Buffer) - return true; - - // Not direct match but may be a previous pinned allocation in the map which - // contains the buffer. Return false if there is no such a previous - // allocation. - if (It == HostAllocations.begin()) - return false; - - // Move to the previous pinned allocation. - --It; - - // Evaluate whether the buffer is contained in the pinned allocation. - return ((const char *)It->first + It->second > (const char *)Buffer); - } - /// Return the execution mode used for kernel \p Name. Expected getExecutionModeForKernel(StringRef Name, DeviceImageTy &Image); @@ -505,6 +587,9 @@ /// does not mean that device J can access device I's memory directly. llvm::SmallVector PeerAccesses; std::mutex PeerAccessesLock; + + /// Map of host pinned allocations used to optimize device transfers. + PinnedAllocationMapTy PinnedAllocs; }; /// Class implementing common functionalities of offload plugins. 
Each plugin Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp =================================================================== --- openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -388,21 +388,27 @@ return ExecModeGlobal.getValue(); } -Error GenericDeviceTy::registerHostPinnedMemoryBuffer(const void *Buffer, +Error PinnedAllocationMapTy::registerHostPinnedBuffer(void *HstPtr, + void *DevAccessiblePtr, size_t Size) { - std::lock_guard Lock(HostAllocationsMutex); + assert(HstPtr && "Invalid pointer"); + assert(DevAccessiblePtr && "Invalid pointer"); - auto Res = HostAllocations.insert({Buffer, Size}); + std::lock_guard Lock(Mutex); + + auto Res = Allocs.insert({HstPtr, DevAccessiblePtr, Size}); if (!Res.second) return Plugin::error("Registering an already registered pinned buffer"); return Plugin::success(); } -Error GenericDeviceTy::unregisterHostPinnedMemoryBuffer(const void *Buffer) { - std::lock_guard Lock(HostAllocationsMutex); +Error PinnedAllocationMapTy::unregisterHostPinnedBuffer(void *HstPtr) { + assert(HstPtr && "Invalid pointer"); + + std::lock_guard Lock(Mutex); - size_t Erased = HostAllocations.erase(Buffer); + size_t Erased = Allocs.erase({HstPtr}); if (!Erased) return Plugin::error("Cannot find a registered host pinned buffer"); @@ -452,7 +458,7 @@ // Register allocated buffer as pinned memory if the type is host memory. if (Kind == TARGET_ALLOC_HOST) - if (auto Err = registerHostPinnedMemoryBuffer(Alloc, Size)) + if (auto Err = PinnedAllocs.registerHostPinnedBuffer(Alloc, Alloc, Size)) return Err; return Alloc; @@ -470,12 +476,41 @@ // Unregister deallocated pinned memory buffer if the type is host memory. 
if (Kind == TARGET_ALLOC_HOST) - if (auto Err = unregisterHostPinnedMemoryBuffer(TgtPtr)) + if (auto Err = PinnedAllocs.unregisterHostPinnedBuffer(TgtPtr)) return Err; return Plugin::success(); } +Expected GenericDeviceTy::dataLock(void *HstPtr, int64_t Size) { + if (PinnedAllocs.isHostPinnedBuffer(HstPtr)) + return Plugin::error("Host buffer already pinned"); + + auto PinnedPtrOrErr = dataLockImpl(HstPtr, Size); + if (!PinnedPtrOrErr) + return PinnedPtrOrErr.takeError(); + + // Do nothing if the plugin does not support pinned memory. + void *PinnedPtr = *PinnedPtrOrErr; + if (!PinnedPtr) + return nullptr; + + if (auto Err = PinnedAllocs.registerHostPinnedBuffer(HstPtr, PinnedPtr, Size)) + return Err; + + return PinnedPtr; +} + +Error GenericDeviceTy::dataUnlock(void *HstPtr) { + if (!PinnedAllocs.isHostPinnedBuffer(HstPtr)) + return Plugin::error("Unknown host pinned buffer"); + + if (auto Err = dataUnlockImpl(HstPtr)) + return Err; + + return PinnedAllocs.unregisterHostPinnedBuffer(HstPtr); +} + Error GenericDeviceTy::dataSubmit(void *TgtPtr, const void *HstPtr, int64_t Size, __tgt_async_info *AsyncInfo) { auto Err = Plugin::success(); @@ -815,6 +850,36 @@ return OFFLOAD_SUCCESS; } +int32_t __tgt_rtl_data_lock(int32_t DeviceId, void *Ptr, int64_t Size, + void **LockedPtr) { + auto LockedPtrOrErr = Plugin::get().getDevice(DeviceId).dataLock(Ptr, Size); + if (!LockedPtrOrErr) { + auto Err = LockedPtrOrErr.takeError(); + REPORT("Failure to lock memory %p: %s\n", Ptr, + toString(std::move(Err)).data()); + return OFFLOAD_FAIL; + } + + if (!(*LockedPtrOrErr)) { + REPORT("Failure to lock memory %p: obtained a null locked pointer\n", Ptr); + return OFFLOAD_FAIL; + } + *LockedPtr = *LockedPtrOrErr; + + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_unlock(int32_t DeviceId, void *Ptr) { + auto Err = Plugin::get().getDevice(DeviceId).dataUnlock(Ptr); + if (Err) { + REPORT("Failure to unlock memory %p: %s\n", Ptr, + toString(std::move(Err)).data()); + return 
OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; +} + int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size) { return __tgt_rtl_data_submit_async(DeviceId, TgtPtr, HstPtr, Size, Index: openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp =================================================================== --- openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp +++ openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -494,6 +494,13 @@ return Plugin::check(Res, "Error in cuStreamQuery: %s"); } + Expected dataLockImpl(void *HstPtr, int64_t Size) override { + // TODO: Register the buffer as CUDA host memory. + return HstPtr; + } + + Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); } + /// Submit data to the device (host to device transfer). Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) override { Index: openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp =================================================================== --- openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp +++ openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp @@ -215,6 +215,15 @@ return OFFLOAD_SUCCESS; } + /// This plugin does nothing to lock buffers. Do not return an error, just + /// return the same pointer as the device pointer. + Expected dataLockImpl(void *HstPtr, int64_t Size) override { + return HstPtr; + } + + /// Nothing to do when unlocking the buffer. + Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); } + /// Submit data to the device (host to device transfer). Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) override {