Index: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp =================================================================== --- openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -1764,18 +1764,33 @@ return Plugin::success(); } + Expected dataLockImpl(void *HstPtr, int64_t Size) override { + void *PinnedPtr = nullptr; + + hsa_status_t Status; + Status = hsa_amd_memory_lock(HstPtr, Size, nullptr, 0, &PinnedPtr); + if (auto Err = Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n")) + return Err; + + return PinnedPtr; + } + + Error dataUnlockImpl(void *HstPtr) override { + hsa_status_t Status = hsa_amd_memory_unlock(HstPtr); + return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n"); + } + /// Submit data to the device (host to device transfer). Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) override { + void *PinnedPtr = nullptr; // Use one-step asynchronous operation when host memory is already pinned. - if (isHostPinnedMemoryBuffer(HstPtr)) { + if (PinnedAllocs.isHostPinnedBuffer(const_cast(HstPtr), &PinnedPtr)) { AMDGPUStreamTy &Stream = getStream(AsyncInfoWrapper); - return Stream.pushPinnedMemoryCopyAsync(TgtPtr, HstPtr, Size); + return Stream.pushPinnedMemoryCopyAsync(TgtPtr, PinnedPtr, Size); } - void *PinnedHstPtr = nullptr; - // For large transfers use synchronous behavior. 
if (Size >= OMPX_MaxAsyncCopyBytes) { if (AsyncInfoWrapper.hasQueue()) @@ -1784,7 +1799,7 @@ hsa_status_t Status; Status = hsa_amd_memory_lock(const_cast(HstPtr), Size, nullptr, 0, - &PinnedHstPtr); + &PinnedPtr); if (auto Err = Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n")) return Err; @@ -1793,7 +1808,7 @@ if (auto Err = Signal.init()) return Err; - Status = hsa_amd_memory_async_copy(TgtPtr, Agent, PinnedHstPtr, Agent, + Status = hsa_amd_memory_async_copy(TgtPtr, Agent, PinnedPtr, Agent, Size, 0, nullptr, Signal.get()); if (auto Err = Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s")) @@ -1812,27 +1827,25 @@ // Otherwise, use two-step copy with an intermediate pinned host buffer. AMDGPUMemoryManagerTy &PinnedMemoryManager = HostDevice.getPinnedMemoryManager(); - if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedHstPtr)) + if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr)) return Err; AMDGPUStreamTy &Stream = getStream(AsyncInfoWrapper); - return Stream.pushMemoryCopyH2DAsync(TgtPtr, HstPtr, PinnedHstPtr, Size, + return Stream.pushMemoryCopyH2DAsync(TgtPtr, HstPtr, PinnedPtr, Size, PinnedMemoryManager); } /// Retrieve data from the device (device to host transfer). Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) override { + void *PinnedPtr = nullptr; // Use one-step asynchronous operation when host memory is already pinned. - if (isHostPinnedMemoryBuffer(HstPtr)) { - // Use one-step asynchronous operation when host memory is already pinned. + if (PinnedAllocs.isHostPinnedBuffer(HstPtr, &PinnedPtr)) { AMDGPUStreamTy &Stream = getStream(AsyncInfoWrapper); - return Stream.pushPinnedMemoryCopyAsync(HstPtr, TgtPtr, Size); + return Stream.pushPinnedMemoryCopyAsync(const_cast(PinnedPtr), TgtPtr, Size); } - void *PinnedHstPtr = nullptr; - // For large transfers use synchronous behavior. 
if (Size >= OMPX_MaxAsyncCopyBytes) { if (AsyncInfoWrapper.hasQueue()) @@ -1841,7 +1854,7 @@ hsa_status_t Status; Status = hsa_amd_memory_lock(const_cast(HstPtr), Size, nullptr, 0, - &PinnedHstPtr); + &PinnedPtr); if (auto Err = Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n")) return Err; @@ -1850,7 +1863,7 @@ if (auto Err = Signal.init()) return Err; - Status = hsa_amd_memory_async_copy(PinnedHstPtr, Agent, TgtPtr, Agent, + Status = hsa_amd_memory_async_copy(PinnedPtr, Agent, TgtPtr, Agent, Size, 0, nullptr, Signal.get()); if (auto Err = Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s")) @@ -1869,11 +1882,11 @@ // Otherwise, use two-step copy with an intermediate pinned host buffer. AMDGPUMemoryManagerTy &PinnedMemoryManager = HostDevice.getPinnedMemoryManager(); - if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedHstPtr)) + if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr)) return Err; AMDGPUStreamTy &Stream = getStream(AsyncInfoWrapper); - return Stream.pushMemoryCopyD2HAsync(HstPtr, TgtPtr, PinnedHstPtr, Size, + return Stream.pushMemoryCopyD2HAsync(HstPtr, TgtPtr, PinnedPtr, Size, PinnedMemoryManager); } Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h =================================================================== --- openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h +++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -245,6 +245,107 @@ uint32_t MaxNumThreads; }; +/// Class representing a map of host pinned allocations. We track these pinned +/// allocations, so memory transfers involving these buffers can be optimized. +class PinnedAllocationMapTy { + + /// Struct representing a map entry. + struct EntryTy { + /// The host pointer of the pinned allocation. + void *HstPtr; + + /// The pointer that devices' driver should use to transfer data from/to the + /// pinned allocation. 
In most plugins, this pointer will be the same as the + /// host pointer above. + void *DevPtr; + + /// The size of the pinned allocation. + size_t Size; + + /// Create an entry with the host and device pointers, and the buffer size. + EntryTy(void *HstPtr, void *DevPtr, size_t Size) + : HstPtr(HstPtr), DevPtr(DevPtr), Size(Size) {} + + /// Utility constructor used for std::set searches. + EntryTy(void *HstPtr) : HstPtr(HstPtr), DevPtr(nullptr), Size(0) {} + }; + + /// Comparator of map entries. Use the host pointer to enforce an order + /// between entries. + struct EntryCmpTy { + bool operator()(const EntryTy& Left, const EntryTy& Right) const { + return Left.HstPtr < Right.HstPtr; + } + }; + + /// The map of host pinned allocations. + std::set Allocs; + + /// The mutex to protect accesses to the map. + mutable std::shared_mutex Mutex; + + /// Find an allocation that intersects with a specific buffer pointer. + const EntryTy *findIntersecting(void *Buffer) const { + if (Allocs.empty()) + return nullptr; + + // Search the first allocation with starting address that is not less than + // the buffer address. + auto It = Allocs.lower_bound({Buffer}); + + // Direct match of starting addresses. + if (It != Allocs.end() && It->HstPtr == Buffer) + return &(*It); + + // Not direct match but may be a previous pinned allocation in the map which + // contains the buffer. Return nullptr if there is no such previous + // allocation. + if (It == Allocs.begin()) + return nullptr; + + // Move to the previous pinned allocation. + --It; + + // Evaluate whether the buffer is contained in the pinned allocation. + if ((char *)It->HstPtr + It->Size > (char *)Buffer) + return &(*It); + + // None found. + return nullptr; + } + +public: + /// Check whether a buffer belongs to a registered host pinned allocation. In + /// case it is pinned, store in \p DevPtr the pointer that the devices can use + /// for async memory copies. 
+ bool isHostPinnedBuffer(void *HstPtr, void **DevPtr = nullptr) const { + std::shared_lock Lock(Mutex); + + // Find the intersecting allocation if any. + const EntryTy *Entry = findIntersecting(HstPtr); + if (!Entry) + return false; + + if (DevPtr) { + // TODO: Use helper function from D140719 patch. + ptrdiff_t Offset = ((char *)HstPtr) - ((char *)Entry->HstPtr); + *DevPtr = ((char *)Entry->DevPtr) + Offset; + } + + return true; + } + + /// Register a host buffer as host pinned memory. The allocation requires the + /// host pointer in \p HstPtr, the pointer that the devices' driver should use + /// when copying data from/to the allocation in \p DevPtr, and the size of the + /// allocation in \p Size. Notice that most plugins may use the same pointer + /// for the \p HstPtr and \p DevPtr. + Error registerHostPinnedBuffer(void *HstPtr, void *DevPtr, size_t Size); + + /// Unregister a host pinned allocation. + Error unregisterHostPinnedBuffer(void *HstPtr); +}; + /// Class implementing common functionalities of offload devices. Each plugin /// should define the specific device class, derive from this generic one, and /// implement the necessary virtual function members. @@ -303,6 +404,16 @@ /// Deallocate data from the device or involving the device. Error dataDelete(void *TgtPtr, TargetAllocTy Kind); + /// Pin host memory for DMA transfers and return the memory pointer that + /// devices' driver should use for memory transfers involving the pinned + /// allocations. + Expected dataLock(void *HstPtr, int64_t Size); + virtual Expected dataLockImpl(void *HstPtr, int64_t Size) = 0; + + /// Unpin a host memory buffer that was previously pinned. + Error dataUnlock(void *HstPtr); + virtual Error dataUnlockImpl(void *HstPtr) = 0; + /// Submit data to the device (host to device transfer). Error dataSubmit(void *TgtPtr, const void *HstPtr, int64_t Size, __tgt_async_info *AsyncInfo); @@ -418,12 +529,6 @@ /// setupDeviceEnvironment() function. 
virtual bool shouldSetupDeviceEnvironment() const { return true; } - /// Register a host buffer as host pinned allocation. - Error registerHostPinnedMemoryBuffer(const void *Buffer, size_t Size); - - /// Unregister a host pinned allocations. - Error unregisterHostPinnedMemoryBuffer(const void *Buffer); - /// Pointer to the memory manager or nullptr if not available. MemoryManagerTy *MemoryManager; @@ -438,39 +543,7 @@ UInt64Envar OMPX_TargetStackSize; UInt64Envar OMPX_TargetHeapSize; - /// Map of host pinned allocations. We track these pinned allocations so that - /// memory transfers involving these allocations can be optimized. - std::map HostAllocations; - mutable std::shared_mutex HostAllocationsMutex; - protected: - /// Check whether a buffer has been registered as host pinned memory. - bool isHostPinnedMemoryBuffer(const void *Buffer) const { - std::shared_lock Lock(HostAllocationsMutex); - - if (HostAllocations.empty()) - return false; - - // Search the first allocation with starting address that is not less than - // the buffer address. - auto It = HostAllocations.lower_bound(Buffer); - - // Direct match of starting addresses. - if (It != HostAllocations.end() && It->first == Buffer) - return true; - - // Not direct match but may be a previous pinned allocation in the map which - // contains the buffer. Return false if there is no such a previous - // allocation. - if (It == HostAllocations.begin()) - return false; - - // Move to the previous pinned allocation. - --It; - - // Evaluate whether the buffer is contained in the pinned allocation. - return ((const char *)It->first + It->second > (const char *)Buffer); - } /// Environment variables defined by the LLVM OpenMP implementation /// regarding the initial number of streams and events. @@ -501,6 +574,9 @@ /// does not mean that device J can access device I's memory directly. 
llvm::SmallVector PeerAccesses; std::mutex PeerAccessesLock; + + /// Map of host pinned allocations used to optimize device transfers. + PinnedAllocationMapTy PinnedAllocs; }; /// Class implementing common functionalities of offload plugins. Each plugin Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp =================================================================== --- openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp +++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -357,21 +357,25 @@ return Plugin::success(); } -Error GenericDeviceTy::registerHostPinnedMemoryBuffer(const void *Buffer, - size_t Size) { - std::lock_guard Lock(HostAllocationsMutex); +Error PinnedAllocationMapTy::registerHostPinnedBuffer(void *HstPtr, void *DevPtr, size_t Size) { + assert(HstPtr && "Invalid host buffer"); + assert(DevPtr && "Invalid device buffer"); - auto Res = HostAllocations.insert({Buffer, Size}); + std::lock_guard Lock(Mutex); + + auto Res = Allocs.insert({HstPtr, DevPtr, Size}); if (!Res.second) return Plugin::error("Registering an already registered pinned buffer"); return Plugin::success(); } -Error GenericDeviceTy::unregisterHostPinnedMemoryBuffer(const void *Buffer) { - std::lock_guard Lock(HostAllocationsMutex); +Error PinnedAllocationMapTy::unregisterHostPinnedBuffer(void *HstPtr) { + assert(HstPtr && "Invalid host buffer"); + + std::lock_guard Lock(Mutex); - size_t Erased = HostAllocations.erase(Buffer); + size_t Erased = Allocs.erase({HstPtr}); if (!Erased) return Plugin::error("Cannot find a registered host pinned buffer"); @@ -421,7 +425,7 @@ // Register allocated buffer as pinned memory if the type is host memory. 
if (Kind == TARGET_ALLOC_HOST) - if (auto Err = registerHostPinnedMemoryBuffer(Alloc, Size)) + if (auto Err = PinnedAllocs.registerHostPinnedBuffer(Alloc, Alloc, Size)) return Err; return Alloc; @@ -439,12 +443,41 @@ // Unregister deallocated pinned memory buffer if the type is host memory. if (Kind == TARGET_ALLOC_HOST) - if (auto Err = unregisterHostPinnedMemoryBuffer(TgtPtr)) + if (auto Err = PinnedAllocs.unregisterHostPinnedBuffer(TgtPtr)) return Err; return Plugin::success(); } +Expected GenericDeviceTy::dataLock(void *HstPtr, int64_t Size) { + if (PinnedAllocs.isHostPinnedBuffer(HstPtr)) + return Plugin::error("Host buffer already pinned"); + + auto PinnedPtrOrErr = dataLockImpl(HstPtr, Size); + if (!PinnedPtrOrErr) + return PinnedPtrOrErr.takeError(); + + // Do nothing if the plugin does not support pinned memory. + void *PinnedPtr = *PinnedPtrOrErr; + if (!PinnedPtr) + return nullptr; + + if (auto Err = PinnedAllocs.registerHostPinnedBuffer(HstPtr, PinnedPtr, Size)) + return Err; + + return PinnedPtr; +} + +Error GenericDeviceTy::dataUnlock(void *HstPtr) { + if (!PinnedAllocs.isHostPinnedBuffer(HstPtr)) + return Plugin::error("Unknown host pinned buffer"); + + if (auto Err = dataUnlockImpl(HstPtr)) + return Err; + + return PinnedAllocs.unregisterHostPinnedBuffer(HstPtr); +} + Error GenericDeviceTy::dataSubmit(void *TgtPtr, const void *HstPtr, int64_t Size, __tgt_async_info *AsyncInfo) { auto Err = Plugin::success(); @@ -773,6 +806,25 @@ return (bool)Err; } +void *__tgt_rtl_data_lock(int DeviceId, void *Ptr, int64_t Size) { + auto PinnedPtrOrErr = Plugin::get().getDevice(DeviceId).dataLock(Ptr, Size); + if (!PinnedPtrOrErr) { + auto Err = PinnedPtrOrErr.takeError(); + REPORT("Failure to lock memory %p: %s\n", Ptr, + toString(std::move(Err)).data()); + return nullptr; + } + + return *PinnedPtrOrErr; +} + +void __tgt_rtl_data_unlock(int DeviceId, void *Ptr) { + auto Err = Plugin::get().getDevice(DeviceId).dataUnlock(Ptr); + if (Err) + REPORT("Failure to unlock 
memory %p: %s\n", Ptr, + toString(std::move(Err)).data()); +} + int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size) { return __tgt_rtl_data_submit_async(DeviceId, TgtPtr, HstPtr, Size, Index: openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp =================================================================== --- openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp +++ openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -512,6 +512,13 @@ return Plugin::check(Res, "Error in cuStreamQuery: %s"); } + Expected dataLockImpl(void *HstPtr, int64_t Size) override { + // TODO: Register the buffer as CUDA host memory. + return HstPtr; + } + + Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); } + /// Submit data to the device (host to device transfer). Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) override { Index: openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp =================================================================== --- openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp +++ openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp @@ -215,6 +215,13 @@ return OFFLOAD_SUCCESS; } + Expected dataLockImpl(void *HstPtr, int64_t Size) override { + // Do nothing. + return HstPtr; + } + + Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); } + /// Submit data to the device (host to device transfer). Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) override {