Index: openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp =================================================================== --- openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp +++ openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp @@ -47,6 +47,8 @@ DLWRAP(hsa_amd_memory_async_copy, 8); DLWRAP(hsa_amd_memory_pool_get_info, 3); DLWRAP(hsa_amd_agents_allow_access, 4); +DLWRAP(hsa_amd_memory_lock, 5); +DLWRAP(hsa_amd_memory_unlock, 1); DLWRAP(hsa_amd_memory_fill, 3); DLWRAP(hsa_amd_register_system_event_handler, 2); Index: openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h =================================================================== --- openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h +++ openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h @@ -76,6 +76,12 @@ const uint32_t *flags, const void *ptr); +hsa_status_t hsa_amd_memory_lock(void* host_ptr, size_t size, + hsa_agent_t* agents, int num_agent, + void** agent_ptr); + +hsa_status_t hsa_amd_memory_unlock(void* host_ptr); + hsa_status_t hsa_amd_memory_fill(void *ptr, uint32_t value, size_t count); typedef enum hsa_amd_event_type_s { Index: openmp/libomptarget/plugins/amdgpu/impl/impl.cpp =================================================================== --- openmp/libomptarget/plugins/amdgpu/impl/impl.cpp +++ openmp/libomptarget/plugins/amdgpu/impl/impl.cpp @@ -5,9 +5,6 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "hsa_api.h" -#include "impl_runtime.h" -#include "internal.h" #include "rt.h" #include @@ -15,29 +12,27 @@ * Data */ -static hsa_status_t invoke_hsa_copy(hsa_signal_t sig, void *dest, - const void *src, size_t size, - hsa_agent_t agent) { +// host pointer (either src or dest) must be locked via hsa_amd_memory_lock +static hsa_status_t invoke_hsa_copy(hsa_signal_t signal, void *dest, + hsa_agent_t agent, const void *src, + size_t size) { const hsa_signal_value_t init = 1; const hsa_signal_value_t success = 0; - hsa_signal_store_screlease(sig, init); + hsa_signal_store_screlease(signal, init); - hsa_status_t err = - hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0, NULL, sig); - if (err != HSA_STATUS_SUCCESS) { + hsa_status_t err = hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0, + nullptr, signal); + if (err != HSA_STATUS_SUCCESS) return err; - } // async_copy reports success by decrementing and failure by setting to < 0 hsa_signal_value_t got = init; - while (got == init) { - got = hsa_signal_wait_scacquire(sig, HSA_SIGNAL_CONDITION_NE, init, + while (got == init) + got = hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_NE, init, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); - } - if (got != success) { + if (got != success) return HSA_STATUS_ERROR; - } return err; } @@ -48,19 +43,58 @@ } }; +enum CopyDirection { H2D, D2H }; + +static hsa_status_t locking_async_memcpy(enum CopyDirection direction, + hsa_signal_t signal, void *dest, + hsa_agent_t agent, void *src, + void *lockingPtr, size_t size) { + hsa_status_t err; + + void *lockedPtr = nullptr; + err = hsa_amd_memory_lock(lockingPtr, size, nullptr, 0, (void **)&lockedPtr); + if (err != HSA_STATUS_SUCCESS) + return err; + + switch (direction) { + case H2D: + err = invoke_hsa_copy(signal, dest, agent, lockedPtr, size); + break; + case D2H: + err = invoke_hsa_copy(signal, lockedPtr, agent, src, size); + break; + default: + err = HSA_STATUS_ERROR; // fall into unlock before returning + } + + if (err != HSA_STATUS_SUCCESS) { + // do not leak locked host pointers, but discard potential error message + hsa_amd_memory_unlock(lockingPtr); + return err; + } + + err = hsa_amd_memory_unlock(lockingPtr); + if (err != HSA_STATUS_SUCCESS) + return err; + + return HSA_STATUS_SUCCESS; +} + hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest, - const void *hostSrc, size_t size, - hsa_agent_t agent, + void *hostSrc, size_t size, + hsa_agent_t device_agent, hsa_amd_memory_pool_t MemoryPool) { - hsa_status_t rc = hsa_memory_copy(deviceDest, hostSrc, size); + hsa_status_t err; - // hsa_memory_copy sometimes fails in situations where + err = locking_async_memcpy(CopyDirection::H2D, signal, deviceDest, + device_agent, hostSrc, hostSrc, size); + + if (err == HSA_STATUS_SUCCESS) + return err; + + // async memcpy sometimes fails in situations where // allocate + copy succeeds. Looks like it might be related to // locking part of a read only segment. Fall back for now. - if (rc == HSA_STATUS_SUCCESS) { - return HSA_STATUS_SUCCESS; - } - void *tempHostPtr; hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool); if (ret != HSA_STATUS_SUCCESS) { @@ -70,26 +104,26 @@ std::unique_ptr del(tempHostPtr); memcpy(tempHostPtr, hostSrc, size); - if (invoke_hsa_copy(signal, deviceDest, tempHostPtr, size, agent) != - HSA_STATUS_SUCCESS) { - return HSA_STATUS_ERROR; - } - return HSA_STATUS_SUCCESS; + return locking_async_memcpy(CopyDirection::H2D, signal, deviceDest, + device_agent, tempHostPtr, tempHostPtr, size); } -hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *dest, - const void *deviceSrc, size_t size, - hsa_agent_t agent, +hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *hostDest, + void *deviceSrc, size_t size, + hsa_agent_t deviceAgent, hsa_amd_memory_pool_t MemoryPool) { - hsa_status_t rc = hsa_memory_copy(dest, deviceSrc, size); + hsa_status_t err; + + // device has always visibility over both pointers, so use that + err = locking_async_memcpy(CopyDirection::D2H, signal, hostDest, deviceAgent, + deviceSrc, hostDest, size); + + if (err == HSA_STATUS_SUCCESS) + return err; // hsa_memory_copy sometimes fails in situations where // allocate + copy succeeds. Looks like it might be related to // locking part of a read only segment. Fall back for now. - if (rc == HSA_STATUS_SUCCESS) { - return HSA_STATUS_SUCCESS; - } - void *tempHostPtr; hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool); if (ret != HSA_STATUS_SUCCESS) { @@ -98,11 +132,11 @@ } std::unique_ptr del(tempHostPtr); - if (invoke_hsa_copy(signal, tempHostPtr, deviceSrc, size, agent) != - HSA_STATUS_SUCCESS) { + err = locking_async_memcpy(CopyDirection::D2H, signal, tempHostPtr, + deviceAgent, deviceSrc, tempHostPtr, size); + if (err != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR; - } - memcpy(dest, tempHostPtr, size); + memcpy(hostDest, tempHostPtr, size); return HSA_STATUS_SUCCESS; } Index: openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h =================================================================== --- openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h +++ openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h @@ -19,13 +19,12 @@ void *cb_state); hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest, - const void *hostSrc, size_t size, - hsa_agent_t agent, + void *hostSrc, size_t size, + hsa_agent_t device_agent, hsa_amd_memory_pool_t MemoryPool); -hsa_status_t impl_memcpy_d2h(hsa_signal_t sig, void *hostDest, - const void *deviceSrc, size_t size, - hsa_agent_t agent, +hsa_status_t impl_memcpy_d2h(hsa_signal_t sig, void *hostDest, void *deviceSrc, + size_t size, hsa_agent_t device_agent, hsa_amd_memory_pool_t MemoryPool); } Index: openmp/libomptarget/plugins/amdgpu/src/rtl.cpp =================================================================== --- openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -464,10 +464,9 @@ ""); static const int Default_WG_Size = getGridValue<64>().GV_Default_WG_Size; - using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, const void *, - size_t size, hsa_agent_t, - hsa_amd_memory_pool_t); - hsa_status_t freesignalpool_memcpy(void *dest, const void *src, size_t size, + using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t size, + hsa_agent_t, hsa_amd_memory_pool_t); + hsa_status_t freesignalpool_memcpy(void *dest, void *src, size_t size, MemcpyFunc Func, int32_t deviceId) { hsa_agent_t agent = HSAAgents[deviceId]; hsa_signal_t s = FreeSignalPool.pop(); @@ -479,13 +478,13 @@ return r; } - hsa_status_t freesignalpool_memcpy_d2h(void *dest, const void *src, - size_t size, int32_t deviceId) { + hsa_status_t freesignalpool_memcpy_d2h(void *dest, void *src, size_t size, + int32_t deviceId) { return freesignalpool_memcpy(dest, src, size, impl_memcpy_d2h, deviceId); } - hsa_status_t freesignalpool_memcpy_h2d(void *dest, const void *src, - size_t size, int32_t deviceId) { + hsa_status_t freesignalpool_memcpy_h2d(void *dest, void *src, size_t size, + int32_t deviceId) { return freesignalpool_memcpy(dest, src, size, impl_memcpy_h2d, deviceId); }