diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1826,14 +1826,33 @@
     return Plugin::success();
   }
 
+  /// Pin the host buffer and return the device pointer that should be used for
+  /// device transfers.
+  Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
+    void *PinnedPtr = nullptr;
+
+    hsa_status_t Status =
+        hsa_amd_memory_lock(HstPtr, Size, nullptr, 0, &PinnedPtr);
+    if (auto Err = Plugin::check(Status, "Error in hsa_amd_memory_lock: %s\n"))
+      return Err;
+
+    return PinnedPtr;
+  }
+
+  /// Unpin the host buffer.
+  Error dataUnlockImpl(void *HstPtr) override {
+    hsa_status_t Status = hsa_amd_memory_unlock(HstPtr);
+    return Plugin::check(Status, "Error in hsa_amd_memory_unlock: %s\n");
+  }
+
   /// Submit data to the device (host to device transfer).
   Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
                        AsyncInfoWrapperTy &AsyncInfoWrapper) override {
 
-    // Use one-step asynchronous operation when host memory is already pinned.
-    if (isHostPinnedMemoryBuffer(HstPtr)) {
+    if (void *PinnedPtr =
+            PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
       AMDGPUStreamTy &Stream = getStream(AsyncInfoWrapper);
-      return Stream.pushPinnedMemoryCopyAsync(TgtPtr, HstPtr, Size);
+      return Stream.pushPinnedMemoryCopyAsync(TgtPtr, PinnedPtr, Size);
     }
 
     void *PinnedHstPtr = nullptr;
@@ -1887,10 +1906,10 @@
                          AsyncInfoWrapperTy &AsyncInfoWrapper) override {
 
     // Use one-step asynchronous operation when host memory is already pinned.
-    if (isHostPinnedMemoryBuffer(HstPtr)) {
-      // Use one-step asynchronous operation when host memory is already pinned.
+    if (void *PinnedPtr =
+            PinnedAllocs.getDeviceAccessiblePtrFromPinnedBuffer(HstPtr)) {
       AMDGPUStreamTy &Stream = getStream(AsyncInfoWrapper);
-      return Stream.pushPinnedMemoryCopyAsync(HstPtr, TgtPtr, Size);
+      return Stream.pushPinnedMemoryCopyAsync(PinnedPtr, TgtPtr, Size);
     }
 
     void *PinnedHstPtr = nullptr;
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -252,6 +252,146 @@
   uint32_t MaxNumThreads;
 };
 
+/// Class representing a map of host pinned allocations. We track these pinned
+/// allocations, so memory transfers involving these buffers can be optimized.
+class PinnedAllocationMapTy {
+
+  /// Struct representing a map entry.
+  struct EntryTy {
+    /// The host pointer of the pinned allocation.
+    void *HstPtr;
+
+    /// The pointer that devices' driver should use to transfer data from/to the
+    /// pinned allocation. In most plugins, this pointer will be the same as the
+    /// host pointer above.
+    void *DevAccessiblePtr;
+
+    /// The size of the pinned allocation.
+    size_t Size;
+
+    /// The number of references to the pinned allocation. The allocation should
+    /// remain pinned and registered to the map until the number of references
+    /// becomes zero.
+    mutable size_t References;
+
+    /// Create an entry with the host and device accessible pointers, and the
+    /// buffer size.
+    EntryTy(void *HstPtr, void *DevAccessiblePtr, size_t Size)
+        : HstPtr(HstPtr), DevAccessiblePtr(DevAccessiblePtr), Size(Size),
+          References(1) {}
+
+    /// Utility constructor used for std::set searches.
+    EntryTy(void *HstPtr)
+        : HstPtr(HstPtr), DevAccessiblePtr(nullptr), Size(0), References(0) {}
+  };
+
+  /// Comparator of map entries. Use the host pointer to enforce an order
+  /// between entries.
+  struct EntryCmpTy {
+    bool operator()(const EntryTy &Left, const EntryTy &Right) const {
+      return Left.HstPtr < Right.HstPtr;
+    }
+  };
+
+  typedef std::set<EntryTy, EntryCmpTy> PinnedAllocSetTy;
+
+  /// The map of host pinned allocations.
+  PinnedAllocSetTy Allocs;
+
+  /// The mutex to protect accesses to the map.
+  mutable std::shared_mutex Mutex;
+
+  /// Reference to the corresponding device.
+  GenericDeviceTy &Device;
+
+  /// Find an allocation that intersects with the \p Buffer pointer. Assume
+  /// the map's mutex is acquired.
+  PinnedAllocSetTy::iterator findIntersecting(const void *Buffer) const {
+    if (Allocs.empty())
+      return Allocs.end();
+
+    // Search the first allocation with starting address that is not less than
+    // the buffer address.
+    auto It = Allocs.lower_bound({const_cast<void *>(Buffer)});
+
+    // Direct match of starting addresses.
+    if (It != Allocs.end() && It->HstPtr == Buffer)
+      return It;
+
+    // Not a direct match, but a previous pinned allocation in the map may
+    // contain the buffer. Return the end iterator if there is no such
+    // previous allocation.
+    if (It == Allocs.begin())
+      return Allocs.end();
+
+    // Move to the previous pinned allocation.
+    --It;
+
+    // The buffer is contained in the previous pinned allocation.
+    if (advanceVoidPtr(It->HstPtr, It->Size) > Buffer)
+      return It;
+
+    // None found.
+    return Allocs.end();
+  }
+
+public:
+  /// Create the map of pinned allocations corresponding to a specific device.
+  PinnedAllocationMapTy(GenericDeviceTy &Device) : Device(Device) {}
+
+  /// Register a host buffer that was recently locked. None of the already
+  /// registered pinned allocations should intersect with this new one. The
+  /// registration requires the host pointer in \p HstPtr, the pointer that the
+  /// devices should use when transferring data from/to the allocation in
+  /// \p DevAccessiblePtr, and the size of the allocation in \p Size. Notice
+  /// that some plugins may use the same pointer for the \p HstPtr and
+  /// \p DevAccessiblePtr. The allocation must be unregistered using the
+  /// unregisterHostBuffer function.
+  Error registerHostBuffer(void *HstPtr, void *DevAccessiblePtr, size_t Size);
+
+  /// Unregister a host pinned allocation passing the host pointer which was
+  /// previously registered using the registerHostBuffer function. When calling
+  /// this function, the pinned allocation cannot have any other user.
+  Error unregisterHostBuffer(void *HstPtr);
+
+  /// Lock the host buffer at \p HstPtr or register a new user if it intersects
+  /// with an already existing one. A partial overlapping with extension is not
+  /// allowed. The function returns the device accessible pointer of the pinned
+  /// buffer. The buffer must be unlocked using the unlockHostBuffer function.
+  Expected<void *> lockHostBuffer(void *HstPtr, size_t Size);
+
+  /// Unlock the host buffer at \p HstPtr or unregister a user if other users
+  /// are still using the pinned allocation. If this was the last user, the
+  /// pinned allocation is removed from the map and the memory is unlocked.
+  Error unlockHostBuffer(void *HstPtr);
+
+  /// Return the device accessible pointer associated with the host pinned
+  /// allocation to which \p HstPtr belongs, if any. Return null in case the
+  /// \p HstPtr does not belong to any host pinned allocation. The device
+  /// accessible pointer is the one that devices should use for data transfers
+  /// that involve a host pinned buffer.
+  void *getDeviceAccessiblePtrFromPinnedBuffer(const void *HstPtr) const {
+    std::shared_lock<std::shared_mutex> Lock(Mutex);
+
+    // Find the intersecting allocation if any.
+    auto It = findIntersecting(HstPtr);
+    if (It == Allocs.end())
+      return nullptr;
+
+    const EntryTy &Entry = *It;
+    return advanceVoidPtr(Entry.DevAccessiblePtr,
+                          getPtrDiff(HstPtr, Entry.HstPtr));
+  }
+
+  /// Check whether a buffer belongs to a registered host pinned allocation.
+  bool isHostPinnedBuffer(const void *HstPtr) const {
+    std::shared_lock<std::shared_mutex> Lock(Mutex);
+
+    // Return whether there is an intersecting allocation.
+    return (findIntersecting(const_cast<void *>(HstPtr)) != Allocs.end());
+  }
+};
+
 /// Class implementing common functionalities of offload devices. Each plugin
 /// should define the specific device class, derive from this generic one, and
 /// implement the necessary virtual function members.
@@ -310,6 +450,22 @@
   /// Deallocate data from the device or involving the device.
   Error dataDelete(void *TgtPtr, TargetAllocTy Kind);
 
+  /// Pin host memory to optimize transfers and return the device accessible
+  /// pointer that devices should use for memory transfers involving the host
+  /// pinned allocation.
+  Expected<void *> dataLock(void *HstPtr, int64_t Size) {
+    return PinnedAllocs.lockHostBuffer(HstPtr, Size);
+  }
+
+  virtual Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) = 0;
+
+  /// Unpin a host memory buffer that was previously pinned.
+  Error dataUnlock(void *HstPtr) {
+    return PinnedAllocs.unlockHostBuffer(HstPtr);
+  }
+
+  virtual Error dataUnlockImpl(void *HstPtr) = 0;
+
   /// Submit data to the device (host to device transfer).
   Error dataSubmit(void *TgtPtr, const void *HstPtr, int64_t Size,
                    __tgt_async_info *AsyncInfo);
@@ -420,12 +576,6 @@
   /// setupDeviceEnvironment() function.
   virtual bool shouldSetupDeviceEnvironment() const { return true; }
 
-  /// Register a host buffer as host pinned allocation.
-  Error registerHostPinnedMemoryBuffer(const void *Buffer, size_t Size);
-
-  /// Unregister a host pinned allocations.
-  Error unregisterHostPinnedMemoryBuffer(const void *Buffer);
-
   /// Pointer to the memory manager or nullptr if not available.
   MemoryManagerTy *MemoryManager;
 
@@ -440,40 +590,7 @@
   UInt64Envar OMPX_TargetStackSize;
   UInt64Envar OMPX_TargetHeapSize;
 
-  /// Map of host pinned allocations. We track these pinned allocations so that
-  /// memory transfers involving these allocations can be optimized.
-  std::map<const void *, size_t> HostAllocations;
-  mutable std::shared_mutex HostAllocationsMutex;
-
 protected:
-  /// Check whether a buffer has been registered as host pinned memory.
-  bool isHostPinnedMemoryBuffer(const void *Buffer) const {
-    std::shared_lock<std::shared_mutex> Lock(HostAllocationsMutex);
-
-    if (HostAllocations.empty())
-      return false;
-
-    // Search the first allocation with starting address that is not less than
-    // the buffer address.
-    auto It = HostAllocations.lower_bound(Buffer);
-
-    // Direct match of starting addresses.
-    if (It != HostAllocations.end() && It->first == Buffer)
-      return true;
-
-    // Not direct match but may be a previous pinned allocation in the map which
-    // contains the buffer. Return false if there is no such a previous
-    // allocation.
-    if (It == HostAllocations.begin())
-      return false;
-
-    // Move to the previous pinned allocation.
-    --It;
-
-    // Evaluate whether the buffer is contained in the pinned allocation.
-    return (advanceVoidPtr(It->first, It->second) > (const char *)Buffer);
-  }
-
   /// Return the execution mode used for kernel \p Name.
   Expected<OMPTgtExecModeFlags> getExecutionModeForKernel(StringRef Name,
                                                           DeviceImageTy &Image);
@@ -507,6 +624,9 @@
   /// does not mean that device J can access device I's memory directly.
   llvm::SmallVector<PeerAccessState> PeerAccesses;
   std::mutex PeerAccessesLock;
+
+  /// Map of host pinned allocations used to optimize device transfers.
+  PinnedAllocationMapTy PinnedAllocs;
 };
 
 /// Class implementing common functionalities of offload plugins. Each plugin
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -333,7 +333,8 @@
       OMPX_InitialNumStreams("LIBOMPTARGET_NUM_INITIAL_STREAMS", 32),
       OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 32),
       DeviceId(DeviceId), GridValues(OMPGridValues),
-      PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock() {
+      PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(),
+      PinnedAllocs(*this) {
   if (OMP_NumTeams > 0)
     GridValues.GV_Max_Teams =
         std::min(GridValues.GV_Max_Teams, uint32_t(OMP_NumTeams));
@@ -581,23 +582,110 @@
   return ExecModeGlobal.getValue();
 }
 
-Error GenericDeviceTy::registerHostPinnedMemoryBuffer(const void *Buffer,
-                                                      size_t Size) {
-  std::lock_guard<std::shared_mutex> Lock(HostAllocationsMutex);
+Error PinnedAllocationMapTy::registerHostBuffer(void *HstPtr,
+                                                void *DevAccessiblePtr,
+                                                size_t Size) {
+  assert(HstPtr && "Invalid pointer");
+  assert(DevAccessiblePtr && "Invalid pointer");
 
-  auto Res = HostAllocations.insert({Buffer, Size});
+  std::lock_guard<std::shared_mutex> Lock(Mutex);
+
+  // No pinned allocation should intersect.
+  auto Res = Allocs.insert({HstPtr, DevAccessiblePtr, Size});
   if (!Res.second)
-    return Plugin::error("Registering an already registered pinned buffer");
+    return Plugin::error("Cannot register locked buffer");
+
+  return Plugin::success();
+}
+
+Error PinnedAllocationMapTy::unregisterHostBuffer(void *HstPtr) {
+  assert(HstPtr && "Invalid pointer");
+
+  std::lock_guard<std::shared_mutex> Lock(Mutex);
+
+  // Find the pinned allocation starting at the host pointer address.
+  auto It = Allocs.find({HstPtr});
+  if (It == Allocs.end())
+    return Plugin::error("Cannot find locked buffer");
+
+  const EntryTy &Entry = *It;
+
+  // There should be no other references to the pinned allocation.
+  if (Entry.References > 1)
+    return Plugin::error("The locked buffer is still being used");
+
+  // Remove the entry from the map.
+  Allocs.erase(It);
 
   return Plugin::success();
 }
 
-Error GenericDeviceTy::unregisterHostPinnedMemoryBuffer(const void *Buffer) {
-  std::lock_guard<std::shared_mutex> Lock(HostAllocationsMutex);
+Expected<void *> PinnedAllocationMapTy::lockHostBuffer(void *HstPtr,
+                                                       size_t Size) {
+  assert(HstPtr && "Invalid pointer");
+
+  std::lock_guard<std::shared_mutex> Lock(Mutex);
+
+  auto It = findIntersecting(HstPtr);
+
+  // No intersecting registered allocation found in the map. We must lock and
+  // register the memory buffer into the map.
+  if (It == Allocs.end()) {
+    // First, lock the host buffer and retrieve the device accessible pointer.
+    auto PinnedPtrOrErr = Device.dataLockImpl(HstPtr, Size);
+    if (!PinnedPtrOrErr)
+      return PinnedPtrOrErr.takeError();
+
+    // Then, insert the host buffer entry into the map.
+    auto Res = Allocs.insert({HstPtr, *PinnedPtrOrErr, Size});
+    if (!Res.second)
+      return Plugin::error("Cannot register locked buffer");
+
+    // Return the device accessible pointer.
+    return *PinnedPtrOrErr;
+  }
+
+  const EntryTy &Entry = *It;
+
+#ifdef OMPTARGET_DEBUG
+  // Do not allow partial overlapping among host pinned buffers.
+  if (advanceVoidPtr(HstPtr, Size) > advanceVoidPtr(Entry.HstPtr, Entry.Size))
+    return Plugin::error("Partial overlapping not allowed in locked memory");
+#endif
+
+  // Increase the number of references.
+  Entry.References++;
+
+  // Return the device accessible pointer after applying the correct offset.
+  return advanceVoidPtr(Entry.DevAccessiblePtr,
+                        getPtrDiff(HstPtr, Entry.HstPtr));
+}
+
+Error PinnedAllocationMapTy::unlockHostBuffer(void *HstPtr) {
+  assert(HstPtr && "Invalid pointer");
+
+  std::lock_guard<std::shared_mutex> Lock(Mutex);
 
-  size_t Erased = HostAllocations.erase(Buffer);
+  auto It = findIntersecting(HstPtr);
+  if (It == Allocs.end())
+    return Plugin::error("Cannot find locked buffer");
+
+  const EntryTy &Entry = *It;
+
+  // Decrease the number of references. No need to do anything if there are
+  // others using the allocation.
+  if (--Entry.References > 0)
+    return Plugin::success();
+
+  // This was the last user of the allocation. Unlock the original locked memory
+  // buffer, which is the host pointer stored in the entry.
+  if (auto Err = Device.dataUnlockImpl(Entry.HstPtr))
+    return Err;
+
+  // Remove the entry from the map.
+  size_t Erased = Allocs.erase(Entry);
   if (!Erased)
-    return Plugin::error("Cannot find a registered host pinned buffer");
+    return Plugin::error("Cannot find locked buffer");
 
   return Plugin::success();
 }
@@ -648,7 +736,7 @@
 
   // Register allocated buffer as pinned memory if the type is host memory.
   if (Kind == TARGET_ALLOC_HOST)
-    if (auto Err = registerHostPinnedMemoryBuffer(Alloc, Size))
+    if (auto Err = PinnedAllocs.registerHostBuffer(Alloc, Alloc, Size))
      return Err;
 
  return Alloc;
@@ -670,7 +758,7 @@
 
   // Unregister deallocated pinned memory buffer if the type is host memory.
   if (Kind == TARGET_ALLOC_HOST)
-    if (auto Err = unregisterHostPinnedMemoryBuffer(TgtPtr))
+    if (auto Err = PinnedAllocs.unregisterHostBuffer(TgtPtr))
       return Err;
 
   return Plugin::success();
@@ -998,6 +1086,36 @@
   return OFFLOAD_SUCCESS;
 }
 
+int32_t __tgt_rtl_data_lock(int32_t DeviceId, void *Ptr, int64_t Size,
+                            void **LockedPtr) {
+  auto LockedPtrOrErr = Plugin::get().getDevice(DeviceId).dataLock(Ptr, Size);
+  if (!LockedPtrOrErr) {
+    auto Err = LockedPtrOrErr.takeError();
+    REPORT("Failure to lock memory %p: %s\n", Ptr,
+           toString(std::move(Err)).data());
+    return OFFLOAD_FAIL;
+  }
+
+  if (!(*LockedPtrOrErr)) {
+    REPORT("Failure to lock memory %p: obtained a null locked pointer\n", Ptr);
+    return OFFLOAD_FAIL;
+  }
+  *LockedPtr = *LockedPtrOrErr;
+
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_data_unlock(int32_t DeviceId, void *Ptr) {
+  auto Err = Plugin::get().getDevice(DeviceId).dataUnlock(Ptr);
+  if (Err) {
+    REPORT("Failure to unlock memory %p: %s\n", Ptr,
+           toString(std::move(Err)).data());
+    return OFFLOAD_FAIL;
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
 int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
                               int64_t Size) {
   return __tgt_rtl_data_submit_async(DeviceId, TgtPtr, HstPtr, Size,
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -493,6 +493,13 @@
     return Plugin::check(Res, "Error in cuStreamQuery: %s");
   }
 
+  Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
+    // TODO: Register the buffer as CUDA host memory.
+    return HstPtr;
+  }
+
+  Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); }
+
   /// Submit data to the device (host to device transfer).
   Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
                        AsyncInfoWrapperTy &AsyncInfoWrapper) override {
diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
--- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -215,6 +215,15 @@
     return OFFLOAD_SUCCESS;
   }
 
+  /// This plugin does nothing to lock buffers. Do not return an error, just
+  /// return the same pointer as the device pointer.
+  Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
+    return HstPtr;
+  }
+
+  /// Nothing to do when unlocking the buffer.
+  Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); }
+
   /// Submit data to the device (host to device transfer).
   Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
                        AsyncInfoWrapperTy &AsyncInfoWrapper) override {
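
Note (not part of the patch): the following is a minimal, self-contained sketch of the reference-counting and offset semantics that PinnedAllocationMapTy::lockHostBuffer and unlockHostBuffer implement above. The lock/unlock helpers and the std::map bookkeeping below are hypothetical stand-ins for the plugin's dataLockImpl/dataUnlockImpl and the std::set-based map in the patch; only the behavior they model (reusing an enclosing pinned entry, offset-adjusted device pointers, unpinning on the last reference) is taken from the patch. Compile with -std=c++17.

#include <cassert>
#include <cstddef>
#include <map>

namespace {

struct Entry {
  char *DevPtr;
  std::size_t Size;
  std::size_t Refs;
};

// Keyed by the host base pointer of each pinned region.
std::map<char *, Entry> Allocs;

// If an existing entry fully contains [Ptr, Ptr + Size), reuse it and bump its
// reference count; the returned pointer carries the same offset into DevPtr.
// Otherwise a real plugin would pin the buffer here (e.g. hsa_amd_memory_lock)
// and register a fresh entry with Refs == 1.
char *lock(char *Ptr, std::size_t Size) {
  for (auto &[Base, E] : Allocs)
    if (Ptr >= Base && Ptr + Size <= Base + E.Size) {
      ++E.Refs;
      return E.DevPtr + (Ptr - Base);
    }
  Allocs[Ptr] = Entry{Ptr, Size, 1};
  return Ptr;
}

// Drop one reference; only the last user erases the entry (a real plugin would
// also unpin the original base pointer here).
void unlock(char *Ptr) {
  for (auto It = Allocs.begin(); It != Allocs.end(); ++It)
    if (Ptr >= It->first && Ptr < It->first + It->second.Size) {
      if (--It->second.Refs == 0)
        Allocs.erase(It);
      return;
    }
  assert(false && "unlocking a pointer that was never locked");
}

} // namespace

int main() {
  char Buffer[256];
  char *D0 = lock(Buffer, sizeof(Buffer)); // pins and registers the whole buffer
  char *D1 = lock(Buffer + 64, 32);        // nested range: no new pin, Refs == 2
  assert(D1 == D0 + 64);                   // device pointer keeps the host offset
  unlock(Buffer + 64);                     // back to Refs == 1, still pinned
  unlock(Buffer);                          // last user: entry removed (unpinned)
  (void)D0;
  (void)D1;
  return 0;
}

In the patch itself, the same containment test is findIntersecting() plus the partial-overlap check in lockHostBuffer, and the offset arithmetic is the advanceVoidPtr/getPtrDiff pair used in getDeviceAccessiblePtrFromPinnedBuffer and lockHostBuffer.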